Files
scrapers/Working/venues/manual/eventario.py

199 lines
6.6 KiB
Python
Raw Normal View History

2025-11-30 16:29:50 -05:00
import os, sys
2026-01-24 19:01:47 -05:00
from datetime import datetime, timedelta
2025-11-30 16:29:50 -05:00
from dateutil import relativedelta
import json
from selenium.webdriver.common.by import By
from lxml import html
2026-01-09 22:49:19 -05:00
sys.path.append('/var/www/digisnaxx.ado/scrapers')
import dtss
dtss.getReady()
2025-11-30 16:29:50 -05:00
from time import sleep
from pprint import pprint as ppr
2026-01-24 19:01:47 -05:00
from zoneinfo import ZoneInfo
2025-11-30 16:29:50 -05:00
import pytz
2026-01-24 19:01:47 -05:00
from events.models import Event, Organization, Scraper, Calendar
2025-11-30 16:29:50 -05:00
import events.digitools as digitools
# Organization row standing for the Eventario listing site; it is handed to
# digitools.getScraper below together with its own website URL.
venue, created = Organization.objects.get_or_create(
    name="Events Medellin",
    city="Medellin",
    website="https://eventario.co/events-category/social/",
    is_venue=True,
)

scraper, item_count_start, virtcal = digitools.getScraper(venue, venue.website, 'mde')

# Events further out than this many days are not given location details.
td = timedelta(days=31)
# NOTE(review): the cutoff is taken in America/Chicago while tz_str is a fixed
# -0500 offset -- confirm that the zone is intentional.
future_date = datetime.now(tz=ZoneInfo("America/Chicago")) + td

# tz_str is appended to the scraped date text so that it can be parsed into
# an offset-aware datetime with DATETIME_FORMAT.
tz_str = " -0500 UTC"
DATETIME_FORMAT = '%d/%m/%y %I:%M %p %z %Z'
# Formats used previously, kept for reference:
# DATETIME_FORMAT = '%B %d %Y %I:%M%p %z %Z'
# DATETIME_FORMAT = '%B %d %Y %I:%M %p %z %Z'
# DATETIME_FORMAT_2 = '%A, %B %d @ %I%p %Y %z %Z'
2025-11-30 16:29:50 -05:00
# with open('data.json') as f:
# totalLinks = json.load(f)
def getLinks(br, url, links):
    """Collect event summaries from one Eventario category page.

    Opens *url*, keeps clicking the pagination link until it can no longer
    be found or clicked, then parses the expanded page source.

    Args:
        br: Browser object providing ``get``, ``execute_script``,
            ``find_element`` and ``page_source``.
        url: Category listing URL to load.
        links: Event dicts gathered so far; this list is not modified.

    Returns:
        A new list made of *links* followed by one dict per event card
        parsed from this page, with the keys ``link``, ``title``, ``venue``,
        ``venueLink`` and ``label``. A card that lacks one of the expected
        nodes is printed as an error and left out.
    """
    br.get(url)
    sleep(2)
    br.execute_script("window.scrollTo(0, window.scrollY + 1500)")
    sleep(2)

    # Expand the feed. Any failure to find or click the pagination link is
    # treated as "no more pages". Only Exception is caught so that Ctrl-C and
    # SystemExit still stop the script instead of being read as end-of-feed.
    while True:
        try:
            more = br.find_element(By.XPATH, ".//*/div[@class='feed-pagination flexify']/a")
            more.click()
            sleep(2)
            br.execute_script("window.scrollTo(0, window.scrollY + 1375)")
            sleep(2)
        except Exception:
            break

    ps = html.fromstring(br.page_source)
    newLinks = []
    # newlinks = ps.xpath('.//*/div[@class="e-con-inner"]/*/a/@href')
    for event in ps.xpath('.//*/div[@class="ts-preview"]'):
        ev = {}
        try:
            ev['link'] = event.xpath('.//*/h3/a/@href')[0]
            ev['title'] = event.xpath('.//*/h3/a/text()')[0]
            # The last matching anchor is taken as the venue entry.
            venue_text = event.xpath('.//*/ul/li[@class="elementor-repeater-item-46edd7d flexify ts-action"]/a[@class="ts-action-con"]/text()')[-1]
            ev['venue'] = venue_text.replace('\n', '').replace('\t', '')
            ev['venueLink'] = event.xpath('.//*/ul/li[@class="elementor-repeater-item-46edd7d flexify ts-action"]/a[@class="ts-action-con"]/@href')[-1]
            # ev['venueLink'] = event.xpath('.//*/ul/li/a/@href')[1]
            label = event.xpath('.//*/li[@class="elementor-repeater-item-46edd7d flexify ts-action"]/div/text()')
            ev['label'] = ''.join(part.replace('\t', '').replace('\n', '') for part in label).strip()
        except Exception as e:
            # Report the partially filled dict so the missing field is visible.
            print("Error: ", ev, e)
        else:
            newLinks.append(ev)
            # ppr(ev)

    return links + newLinks
2026-02-19 22:49:47 -05:00
2025-11-30 16:29:50 -05:00
# The first command-line argument is passed to digitools.getBrowser; without
# it no browser can be created, so the script stops here.
if len(sys.argv) < 2:
    print("No run_env")
    # sys.exit() rather than quit(): quit is injected by the site module for
    # interactive use and is not guaranteed to exist. Exit status is unchanged.
    sys.exit()
arg1 = sys.argv[1]
br = digitools.getBrowser(arg1)

urls = [
    "https://eventario.co/events-category/social/",
    "https://eventario.co/events-category/musica/",
    "https://eventario.co/events-category/cultura/",
]

# getLinks returns the accumulated list, so thread it through each category.
allLinks = []
for url in urls:
    allLinks = getLinks(br, url, allLinks)

# Keep one entry per title; when a title occurs more than once, the data of
# its last occurrence is the one that is kept.
totalLinks = list({v['title']: v for v in allLinks}.values())
ppr(totalLinks)
ppr(len(totalLinks))
# sortedlinks = allLinks.sort()
# ppr(sortedlinks)
2026-02-19 22:49:47 -05:00
# quit()
# Visit each event page and enrich its dict in place with dateTime, dateStamp
# and, for events before future_date, location and address.
# NOTE(review): the [175:250] slice looks like a debugging or batching
# leftover -- with fewer than 176 links nothing is processed. Confirm.
for event in totalLinks[175:250]:
    br.get(event['link'])
    sleep(1)
    ps = html.fromstring(br.page_source)

    dateTime = ps.xpath('.//*/div[@class="elementor-element elementor-element-d9beb21 elementor-widget elementor-widget-heading"]/span[@class="elementor-heading-title elementor-size-default"]/text()')
    # Drop the first three characters and anything after the first '-'.
    event['dateTime'] = [x[3:].split('-')[0].strip() for x in dateTime]

    try:
        # Raises IndexError when no date text was found and ValueError when it
        # does not match DATETIME_FORMAT; both are reported below.
        event['dateStamp'] = datetime.strptime(event['dateTime'][0] + tz_str, DATETIME_FORMAT)
        if future_date <= event['dateStamp']:
            print("Future Date")
        else:
            locations = ps.xpath('.//*/div[@class="elementor-element elementor-element-f04aae3 elementor-widget__width-initial elementor-widget-mobile__width-initial elementor-widget elementor-widget-ts-advanced-list"]/*/li[@class="elementor-repeater-item-138dbed flexify ts-action"]/a/text()')
            location = [x.replace('\t', '').replace('\n', '') for x in locations]
            if len(location) == 2:
                # Two parts are joined in reverse order: second, then first.
                event['location'] = "{0}, {1}".format(location[1], location[0])
            elif location:
                event['location'] = location[0]
            else:
                event['location'] = 'n/a'

            address = ps.xpath('.//*/ul[@class="flexify simplify-ul ts-advanced-list"]/li[@class="elementor-repeater-item-842568d flexify ts-action"]/div/text()')
            # The same list also carries capacity and price rows; skip those.
            candidates = [x for x in address if 'Capacidad' not in x and '$' not in x]
            # Fallback keeps the original behaviour of storing the raw list.
            event['address'] = candidates[0] if candidates else address
    except Exception as e:
        print("Error: ", event, e)

    # ppr(event)
    sleep(1)

br.close()
2026-01-24 19:01:47 -05:00
# Keep only events whose date parsed, then only those that also got a location.
data = [i for i in totalLinks if 'dateStamp' in i]
new_data = [i for i in data if 'location' in i]
print("Set:", len(totalLinks))
print("Data Set:", len(data))
print("New Data Set:", len(new_data))

# Assign a category code to each event and collect those located in Antioquia.
paisa = []
for d in new_data:
    if not d['dateTime']:
        continue

    label = d['label']
    if 'Familia' in label:
        d['category'] = 'Ot'
    elif 'Comedia' in label:
        d['category'] = 'Co'
    elif 'Magic' in d['title'] or 'Juegos' in label:
        d['category'] = 'Ot'
    elif 'Conferencias' in label or 'Intercambio' in label:
        d['category'] = 'Ed'
    else:
        # Anything not matched above gets 'Mu'.
        d['category'] = 'Mu'

    if "Antioquia" in d['location']:
        # The text before the first comma is used as the city. str.split
        # always returns at least one element, so no exception guard is needed.
        d['city'] = d['location'].split(',')[0]
        paisa.append(d)
# Persist every collected event: resolve (or create) its venue, then the
# event itself, and attach the event to the 'mde' calendar.
cal = Calendar.objects.get(shortcode='mde')
print("run paisa")
ppr(paisa)

for d in paisa:
    try:
        nvenue, created = Organization.objects.get_or_create(
            name=d['venue'],
            city=d['city'],
            website=d['venueLink'],
            address_complete=d['address'],
            is_venue=True,
        )
    except Exception:
        # get_or_create failed: fall back to the existing row with this name
        # and refresh its address.
        # NOTE(review): this lookup can itself raise if no row, or more than
        # one row, has that name.
        nvenue = Organization.objects.get(name=d['venue'])
        nvenue.address_complete = d['address']
        nvenue.save()

    try:
        new_event, created = Event.objects.update_or_create(
            event_type=d['category'],
            show_title=d['title'],
            show_link=d['link'],
            show_date=d['dateStamp'],
            scraper=scraper,
            venue=nvenue,
        )
        new_event.calendar.add(cal)
        new_event.save()
        print(new_event, created, new_event.scraper)
        # print("Success:", new_event)
    except Exception as e:
        # Was misspelled "Execption", which raised NameError on the first
        # failure and aborted the run instead of logging and moving on.
        print(e)

digitools.updateScraper(scraper, item_count_start)