169 lines
5.5 KiB
Python
169 lines
5.5 KiB
Python
|
|
import os, sys
|
||
|
|
from datetime import datetime
|
||
|
|
from dateutil import relativedelta
|
||
|
|
import json
|
||
|
|
|
||
|
|
from selenium.webdriver.common.by import By
|
||
|
|
from lxml import html
|
||
|
|
|
||
|
|
import django
|
||
|
|
sys.path.append('../../../../')
|
||
|
|
os.environ['DJANGO_SETTINGS_MODULE'] = 'config.django.local'
|
||
|
|
django.setup()
|
||
|
|
|
||
|
|
from time import sleep
|
||
|
|
from pprint import pprint as ppr
|
||
|
|
import pytz
|
||
|
|
|
||
|
|
from events.models import Organization, Scraper, Calendar
|
||
|
|
import events.digitools as digitools
|
||
|
|
|
||
|
|
# Organization record that represents the scraped source site itself.
# get_or_create keeps the script re-runnable without duplicating the row.
venue, created = Organization.objects.get_or_create(
    name="Events Medellin",
    city="Medellin",
    website="https://eventario.co/events-category/social/",
    is_venue=True,
)

# getScraper returns three values; `scraper` and `item_count_start` are
# handed back to digitools.updateScraper at the end of the script, while
# `virtcal` is not used anywhere in the visible part of this file.
scraper, item_count_start, virtcal = digitools.getScraper(venue, 'mde')

# strptime patterns. DATETIME_FORMAT is the one applied to scraped dates
# below; DATETIME_FORMAT_2 is defined but not referenced in the visible code.
DATETIME_FORMAT = '%B %d %Y %I:%M%p'
DATETIME_FORMAT_2 = '%A, %B %d @ %I%p %Y'
|
||
|
|
|
||
|
|
# with open('data.json') as f:
|
||
|
|
# totalLinks = json.load(f)
|
||
|
|
|
||
|
|
def getLinks(br, url, links):
    """Collect event summaries from one category listing page.

    Loads ``url`` in the browser, repeatedly clicks the pagination link
    (scrolling after each click) until the link can no longer be found or
    clicked, then parses the rendered page source with lxml.

    Args:
        br: Browser driver; must provide ``get``, ``execute_script``,
            ``find_element`` and ``page_source``.
        url: Address of the listing page to load.
        links: Event dicts collected so far. This list is not mutated.

    Returns:
        A new list containing ``links`` followed by one dict per
        successfully parsed ``div.e-con-inner`` element, with the keys
        ``link``, ``title``, ``venue``, ``venueLink`` and ``label``.
        Elements missing any of those pieces are reported and skipped.
    """
    br.get(url)
    sleep(2)
    br.execute_script("window.scrollTo(0, window.scrollY + 1500)")
    sleep(2)

    # Keep expanding the feed until the pagination link is gone or cannot be
    # clicked. `except Exception` (rather than a bare except) keeps this
    # best-effort while still letting Ctrl-C / SystemExit abort the run.
    while True:
        try:
            more = br.find_element(By.XPATH, ".//*/div[@class='feed-pagination flexify']/a")
            more.click()
            sleep(2)
            br.execute_script("window.scrollTo(0, window.scrollY + 1100)")
            sleep(2)
        except Exception:
            break

    page = html.fromstring(br.page_source)
    found = []
    for card in page.xpath('.//*/div[@class="e-con-inner"]'):
        try:
            link = card.xpath('.//*/a/@href')[0]
            title = card.xpath('.//*/h3/a/text()')[0]
            # The venue name is the last anchor text in the card's list.
            venue_name = card.xpath('.//*/ul/li/a/text()')[-1]
            venue_link = card.xpath('.//*/ul/li/a/@href')[1]
            label_parts = card.xpath('.//*/li[@class="elementor-repeater-item-46edd7d flexify ts-action"]/div/text()')
            found.append({
                'link': link,
                'title': title,
                'venue': venue_name.replace('\n', '').replace('\t', ''),
                'venueLink': venue_link,
                'label': ''.join(
                    part.replace('\t', '').replace('\n', '') for part in label_parts
                ).strip(),
            })
        except Exception as err:
            # Any element lacking one of the expected pieces (typically an
            # IndexError on an empty xpath result) is reported and skipped.
            print("Error: ", err)

    return links + found
|
||
|
|
|
||
|
|
# The first command-line argument is passed to digitools.getBrowser to build
# the browser driver. Without it the script cannot run, so exit with a
# non-zero status. sys.exit is used instead of quit(): quit() is a helper
# injected by the `site` module for interactive sessions and is not
# guaranteed to exist when a program runs.
if len(sys.argv) < 2:
    print("No run_env")
    sys.exit(1)

arg1 = sys.argv[1]
br = digitools.getBrowser(arg1)
|
||
|
|
|
||
|
|
# Category listing pages to crawl.
urls = [
    "https://eventario.co/events-category/social/",
    "https://eventario.co/events-category/musica/",
    "https://eventario.co/events-category/cultura/",
]

# getLinks returns the accumulated list each time, so thread it through.
allLinks = []
for url in urls:
    allLinks = getLinks(br, url, allLinks)

# De-duplicate by title: a later entry with the same title replaces the
# earlier one, while the position of the first occurrence is kept.
_by_title = {}
for _entry in allLinks:
    _by_title[_entry['title']] = _entry
totalLinks = list(_by_title.values())

ppr(len(totalLinks))
|
||
|
|
# sortedlinks = allLinks.sort()
|
||
|
|
# ppr(sortedlinks)
|
||
|
|
|
||
|
|
# XPath selectors for the event detail page.
_DATE_XPATH = './/*/div[@class="elementor-element elementor-element-d9beb21 elementor-widget elementor-widget-heading"]/span[@class="elementor-heading-title elementor-size-default"]/text()'
_LOCATION_XPATH = './/*/div[@class="elementor-element elementor-element-f04aae3 elementor-widget__width-initial elementor-widget-mobile__width-initial elementor-widget elementor-widget-ts-advanced-list"]/*/li[@class="elementor-repeater-item-138dbed flexify ts-action"]/a/text()'
_ADDRESS_XPATH = './/*/ul[@class="flexify simplify-ul ts-advanced-list"]/li[@class="elementor-repeater-item-842568d flexify ts-action"]/div/text()'

# Visit every event page and enrich its dict in place with 'dateTime'
# (list of strings), 'location' (string) and 'address' (string).
# The try/finally guarantees the browser window is closed even when a page
# load or parse step raises part-way through the loop.
try:
    for event in totalLinks:
        br.get(event['link'])
        sleep(2)
        ps = html.fromstring(br.page_source)

        # Drop the first three characters of each heading and keep only the
        # part before the first '-'.
        dateTime = ps.xpath(_DATE_XPATH)
        event['dateTime'] = [x[3:].split('-')[0].strip() for x in dateTime]

        locations = ps.xpath(_LOCATION_XPATH)
        location = [x.replace('\t', '').replace('\n', '') for x in locations]
        if len(location) == 2:
            # Two entries are emitted in reverse order, comma separated.
            event['location'] = "{0}, {1}".format(location[1], location[0])
        elif location:
            event['location'] = location[0]
        else:
            event['location'] = 'n/a'

        # The first list entry that is neither a capacity nor a price line is
        # taken as the address. When none qualifies, fall back to 'n/a' (as
        # for location) so the value is always a string; previously the raw
        # xpath result list was stored here and later handed to the ORM.
        address = ps.xpath(_ADDRESS_XPATH)
        candidates = [x for x in address if 'Capacidad' not in x and '$' not in x]
        event['address'] = candidates[0] if candidates else 'n/a'
        # ppr(event)

    sleep(2)
finally:
    br.close()
|
||
|
|
|
||
|
|
data = totalLinks
print("Set:", len(totalLinks))

# Events that have a date and whose location mentions Antioquia; each one is
# tagged with a 'category' code and a 'city' before being stored.
paisa = []

for d in data:
    # Events without any scraped date cannot be parsed later, so skip them.
    # NOTE(review): the source's indentation was lost; the Antioquia filter
    # is treated as nested under this date check, because the persistence
    # step indexes d['dateTime'][0] and reads d['category'] — confirm.
    if not d['dateTime']:
        continue

    # Map the listing label (and, for magic shows, the title) to a category
    # code; anything unmatched is filed under 'Mu'.
    if 'Familia' in d['label']:
        d['category'] = 'Ot'
    elif 'Comedia' in d['label']:
        d['category'] = 'Co'
    elif ('Magic' in d['title']) or ('Juegos' in d['label']):
        d['category'] = 'Ot'
    elif ('Conferencias' in d['label']) or ('Intercambio' in d['label']):
        d['category'] = 'Ed'
    else:
        d['category'] = 'Mu'

    if "Antioquia" in d['location']:
        # The city is whatever precedes the first comma of the location.
        # str.split always yields at least one element, so the former
        # try/except around this line could never trigger and was removed.
        d['city'] = d['location'].split(',')[0]
        paisa.append(d)
|
||
|
|
|
||
|
|
# NOTE(review): Event is used below but is not imported anywhere in the
# visible file, so the first iteration raised NameError. It is assumed to
# live in events.models next to Organization/Scraper/Calendar — confirm.
from events.models import Event

cal = Calendar.objects.get(shortcode='mde')

# Persist every filtered event: resolve (or create) its venue, then create
# the Event row if an identical one does not exist and attach the calendar.
for d in paisa:
    try:
        d['dateStamp'] = datetime.strptime(d['dateTime'][0], DATETIME_FORMAT)
    except ValueError as err:
        # One malformed date should not abort the whole import; report it
        # and move on to the next event.
        print("Error: ", err)
        continue

    try:
        nvenue, created = Organization.objects.get_or_create(
            name=d['venue'],
            city=d['city'],
            website=d['venueLink'],
            address_complete=d['address'],
            is_venue=True,
        )
    except Exception:
        # get_or_create failed; fall back to looking the venue up by name
        # alone and refresh its stored address. `except Exception` (instead
        # of a bare except) lets Ctrl-C / SystemExit through.
        # NOTE(review): presumably a uniqueness conflict on the name when
        # the other fields differ — confirm against the Organization model.
        nvenue = Organization.objects.get(name=d['venue'])
        nvenue.address_complete = d['address']
        nvenue.save()

    # All fields are passed as lookup arguments, so an existing row with
    # exactly these values is reused and otherwise a new one is created.
    new_event, created = Event.objects.update_or_create(
        event_type=d['category'],
        show_title=d['title'],
        show_link=d['link'],
        show_date=d['dateStamp'],
        show_day=d['dateStamp'],
        scraper=scraper,
        venue=nvenue,
    )
    new_event.calendar.add(cal)
    new_event.save()
    print(new_event)

digitools.updateScraper(scraper, item_count_start)
|