# scrapers/Working/venues/manual/eventario.py
# 2026-01-24 19:01:47 -05:00 — 186 lines, 6.3 KiB, Python
import os, sys
from datetime import datetime, timedelta
from dateutil import relativedelta
import json
from selenium.webdriver.common.by import By
from lxml import html
sys.path.append('/var/www/digisnaxx.ado/scrapers')
import dtss
dtss.getReady()
from time import sleep
from pprint import pprint as ppr
from zoneinfo import ZoneInfo
import pytz
from events.models import Event, Organization, Scraper, Calendar
import events.digitools as digitools
# Organization this scraper is registered under; get_or_create makes
# repeated runs idempotent (the `created` flag is unused).
venue, created = Organization.objects.get_or_create(
    name="Events Medellin",
    city="Medellin",
    website="https://eventario.co/events-category/social/",
    is_venue=True
)
# Project helper: returns the Scraper row, its starting item count, and the
# virtual calendar for the 'mde' (Medellin) shortcode.
scraper,item_count_start,virtcal = digitools.getScraper(venue, venue.website, 'mde')
# Fixed UTC offset/zone name appended to scraped date strings before parsing
# with DATETIME_FORMAT (%z %Z consume it).
tz_str = " -0500 UTC"
DATETIME_FORMAT = '%d/%m/%y %I:%M %p %z %Z'
# Earlier format attempts kept for reference:
# DATETIME_FORMAT = '%B %d %Y %I:%M%p %z %Z'
# DATETIME_FORMAT = '%B %d %Y %I:%M %p %z %Z'
# DATETIME_FORMAT_2 = '%A, %B %d @ %I%p %Y %z %Z'
# Events dated beyond this cutoff are skipped in the scrape loop below.
td = timedelta(days=31)
# NOTE(review): cutoff uses America/Chicago while parsed stamps carry -0500;
# during CDT these differ by one hour — confirm that skew is acceptable.
future_date = datetime.now(ZoneInfo("America/Chicago")) + td
# with open('data.json') as f:
# totalLinks = json.load(f)
def getLinks(br, url, links):
    """Collect event teaser dicts from one eventario.co category page.

    Loads *url* in the Selenium browser *br*, repeatedly clicks the
    "load more" pagination link until it disappears, then parses the
    fully expanded page with lxml.

    Args:
        br: Selenium WebDriver instance.
        url: category listing URL to scrape.
        links: accumulator list of event dicts from previous calls.

    Returns:
        A new list: *links* plus one dict per parsed card with keys
        'link', 'title', 'venue', 'venueLink' and 'label'.
    """
    br.get(url)
    sleep(2)
    # Scroll so the lazy-rendered pagination control appears in view.
    br.execute_script("window.scrollTo(0, window.scrollY + 1500)")
    sleep(2)
    while True:
        try:
            div = br.find_element(By.XPATH, ".//*/div[@class='feed-pagination flexify']/a")
            div.click()
            sleep(2)
            br.execute_script("window.scrollTo(0, window.scrollY + 1100)")
            sleep(2)
        except Exception:
            # No more "load more" link (NoSuchElementException) or the click
            # failed — all pages are loaded, stop paginating.
            break
    ps = html.fromstring(br.page_source)
    newLinks = []
    events = ps.xpath('.//*/div[@class="e-con-inner"]')
    for event in events:
        ev = {}
        try:
            ev['link'] = event.xpath('.//*/a/@href')[0]
            ev['title'] = event.xpath('.//*/h3/a/text()')[0]
            # Last anchor in the meta list is the venue name; strip layout whitespace.
            ev['venue'] = event.xpath('.//*/ul/li/a/text()')[-1:][0].replace('\n', '').replace('\t', '')
            ev['venueLink'] = event.xpath('.//*/ul/li/a/@href')[1]
            label = event.xpath('.//*/li[@class="elementor-repeater-item-46edd7d flexify ts-action"]/div/text()')
            ev['label'] = ''.join([x.replace('\t', '').replace('\n', '') for x in label]).strip()
            newLinks.append(ev)
        except Exception as e:
            # Card didn't match the expected markup (usually an IndexError on
            # a missing element); log the partial dict and skip the card.
            print("Error: ", ev, e)
    return links + newLinks
# A run-environment argument is required (passed through to
# digitools.getBrowser to pick the browser configuration); abort without it.
if len(sys.argv) >= 2:
    arg1 = sys.argv[1]
    br = digitools.getBrowser(arg1)
else:
    print("No run_env")
    quit()
# Category listing pages to harvest event links from.
urls = ["https://eventario.co/events-category/social/", "https://eventario.co/events-category/musica/", "https://eventario.co/events-category/cultura/"]
allLinks = []
for url in urls:
    allLinks = getLinks(br, url, allLinks)
# De-duplicate by title: dict comprehension keeps the last occurrence of
# each title, preserving insertion order.
totalLinks = list({v['title']:v for v in allLinks}.values())
ppr(len(totalLinks))
# sortedlinks = allLinks.sort()
# ppr(sortedlinks)
# Visit each event's detail page and enrich its dict with a parsed date,
# location and address.  Events whose date can't be parsed never get a
# 'dateStamp' key; events beyond the 31-day window never get 'location' —
# both are filtered out by the list comprehensions after this loop.
for event in totalLinks:
    br.get(event['link'])
    sleep(1)
    ps = html.fromstring(br.page_source)
    dateTime = ps.xpath('.//*/div[@class="elementor-element elementor-element-d9beb21 elementor-widget elementor-widget-heading"]/span[@class="elementor-heading-title elementor-size-default"]/text()')
    # Drop the 3-char day-name prefix and anything after " - " (end time).
    event['dateTime'] = [x[3:].split('-')[0].strip() for x in dateTime]
    try:
        event['dateStamp'] = datetime.strptime(event['dateTime'][0] + tz_str, DATETIME_FORMAT)
        if future_date <= event['dateStamp']:
            # Too far out — skip detail extraction so the event is
            # dropped by the 'location' filter later.
            print("Future Date")
        else:
            locations = ps.xpath('.//*/div[@class="elementor-element elementor-element-f04aae3 elementor-widget__width-initial elementor-widget-mobile__width-initial elementor-widget elementor-widget-ts-advanced-list"]/*/li[@class="elementor-repeater-item-138dbed flexify ts-action"]/a/text()')
            location = [x.replace('\t', '').replace('\n', '') for x in locations]
            if len(location) == 2:
                # Two entries scrape in (region, venue) order; store "venue, region".
                event['location'] = "{0}, {1}".format(location[1], location[0])
            else:
                try:
                    event['location'] = location[0]
                except IndexError:
                    event['location'] = 'n/a'
            address = ps.xpath('.//*/ul[@class="flexify simplify-ul ts-advanced-list"]/li[@class="elementor-repeater-item-842568d flexify ts-action"]/div/text()')
            try:
                # First list item that is neither a capacity nor a price line.
                event['address'] = [x for x in address if 'Capacidad' not in x and '$' not in x][0]
            except IndexError:
                # NOTE(review): falls back to the raw (possibly empty) list,
                # not a string — this value is later written to the DB as-is.
                event['address'] = address
    except Exception as e:
        # Date list empty or unparseable — log and move on; 'dateStamp'
        # stays unset so the event is filtered out downstream.
        print("Error: ", event, e)
    # ppr(event)
    sleep(1)
br.close()
# Keep only events whose detail scrape produced both a parsed date and a
# location, then categorize and restrict to the Antioquia department.
data = [i for i in totalLinks if 'dateStamp' in i]
new_data = [i for i in data if 'location' in i]
print("Set:", len(totalLinks))
print("Data Set:", len(data))
print("New Data Set:", len(new_data))
paisa = []
for d in new_data:
    if d['dateTime']:
        # Map the scraped label/title to the project's two-letter category
        # codes (Ot/Co/Ed used explicitly; everything else defaults to Mu).
        if 'Familia' in d['label']:
            d['category'] = 'Ot'
        elif 'Comedia' in d['label']:
            d['category'] = 'Co'
        elif ('Magic' in d['title']) or ('Juegos' in d['label']):
            d['category'] = 'Ot'
        elif ('Conferencias' in d['label']) or ('Intercambio' in d['label']):
            d['category'] = 'Ed'
        else:
            d['category'] = 'Mu'
        if "Antioquia" in d['location']:
            try:
                # City is the leading component of "Venue, City, Antioquia"-style strings.
                d['city'] = d['location'].split(',')[0]
                paisa.append(d)
            except Exception:
                # Defensive: 'location' should be a string here, but skip
                # the record rather than crash the whole run.
                continue
# Persist the Antioquia events under the Medellin ('mde') calendar.
cal = Calendar.objects.get(shortcode='mde')
for d in paisa:
    try:
        # Create (or fetch an exact match of) the event's venue organization.
        nvenue, created = Organization.objects.get_or_create(
            name=d['venue'],
            city=d['city'],
            website=d['venueLink'],
            address_complete=d['address'],
            is_venue=True
        )
    except Exception:
        # Creation failed — presumably an existing row with this name but
        # different field values (get_or_create matches on ALL kwargs).
        # Reuse that row and refresh its address.
        # NOTE(review): .get(name=...) raises MultipleObjectsReturned if
        # names are not unique — confirm uniqueness in practice.
        nvenue = Organization.objects.get(name=d['venue'])
        nvenue.address_complete = d['address']
        nvenue.save()
    # All fields participate in the lookup, so re-runs match the same row
    # and avoid duplicate events.
    new_event, created = Event.objects.update_or_create(
        event_type=d['category'],
        show_title=d['title'],
        show_link=d['link'],
        show_date=d['dateStamp'],
        show_day=d['dateStamp'],
        scraper=scraper,
        venue=nvenue
    )
    new_event.calendar.add(cal)
    new_event.save()
    # print("Success:", new_event)
# Record the final item count so the project can diff against the start count.
digitools.updateScraper(scraper, item_count_start)