more scrapers

This commit is contained in:
2026-01-24 19:01:47 -05:00
parent 7013d8327a
commit 3c4a41ae2c
58 changed files with 1614 additions and 2988 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,5 +1,5 @@
import os, sys
from datetime import datetime
from datetime import datetime, timedelta
from dateutil import relativedelta
import json
@@ -12,9 +12,10 @@ dtss.getReady()
from time import sleep
from pprint import pprint as ppr
from zoneinfo import ZoneInfo
import pytz
from events.models import Organization, Scraper, Calendar
from events.models import Event, Organization, Scraper, Calendar
import events.digitools as digitools
venue, created = Organization.objects.get_or_create(
@@ -25,9 +26,14 @@ venue, created = Organization.objects.get_or_create(
)
scraper,item_count_start,virtcal = digitools.getScraper(venue, venue.website, 'mde')
tz_str = " -0500 UTC"
DATETIME_FORMAT = '%d/%m/%y %I:%M %p %z %Z'
# DATETIME_FORMAT = '%B %d %Y %I:%M%p %z %Z'
# DATETIME_FORMAT = '%B %d %Y %I:%M %p %z %Z'
# DATETIME_FORMAT_2 = '%A, %B %d @ %I%p %Y %z %Z'
DATETIME_FORMAT = '%B %d %Y %I:%M%p'
DATETIME_FORMAT_2 = '%A, %B %d @ %I%p %Y'
td = timedelta(days=31)
future_date = datetime.now(ZoneInfo("America/Chicago")) + td
# with open('data.json') as f:
# totalLinks = json.load(f)
@@ -52,18 +58,18 @@ def getLinks(br, url, links):
# newlinks = ps.xpath('.//*/div[@class="e-con-inner"]/*/a/@href')
events = ps.xpath('.//*/div[@class="e-con-inner"]')
for event in events:
e = {}
ev = {}
try:
e['link'] = event.xpath('.//*/a/@href')[0]
e['title'] = event.xpath('.//*/h3/a/text()')[0]
e['venue'] = event.xpath('.//*/ul/li/a/text()')[-1:][0].replace('\n', '').replace('\t', '')
ev['link'] = event.xpath('.//*/a/@href')[0]
ev['title'] = event.xpath('.//*/h3/a/text()')[0]
ev['venue'] = event.xpath('.//*/ul/li/a/text()')[-1:][0].replace('\n', '').replace('\t', '')
# e['venue'] = event.xpath('.//*/ul/li/a/text()')[-1:][0]
e['venueLink'] = event.xpath('.//*/ul/li/a/@href')[1]
ev['venueLink'] = event.xpath('.//*/ul/li/a/@href')[1]
label= event.xpath('.//*/li[@class="elementor-repeater-item-46edd7d flexify ts-action"]/div/text()')
e['label'] = ''.join([x.replace('\t', '').replace('\n', '') for x in label]).strip()
newLinks.append(e)
ev['label'] = ''.join([x.replace('\t', '').replace('\n', '') for x in label]).strip()
newLinks.append(ev)
except Exception as e:
print("Error: ", e)
print("Error: ", ev, e)
links = links + newLinks
return links
@@ -89,35 +95,47 @@ ppr(len(totalLinks))
for event in totalLinks:
br.get(event['link'])
sleep(2)
sleep(1)
ps = html.fromstring(br.page_source)
dateTime= ps.xpath('.//*/div[@class="elementor-element elementor-element-d9beb21 elementor-widget elementor-widget-heading"]/span[@class="elementor-heading-title elementor-size-default"]/text()')
event['dateTime'] = [x[3:].split('-')[0].strip() for x in dateTime]
locations = ps.xpath('.//*/div[@class="elementor-element elementor-element-f04aae3 elementor-widget__width-initial elementor-widget-mobile__width-initial elementor-widget elementor-widget-ts-advanced-list"]/*/li[@class="elementor-repeater-item-138dbed flexify ts-action"]/a/text()')
location = [x.replace('\t', '').replace('\n', '') for x in locations]
if len(location) == 2:
event['location'] = "{0}, {1}".format(location[1], location[0])
else:
try:
event['location'] = location[0]
except:
event['location'] = 'n/a'
address= ps.xpath('.//*/ul[@class="flexify simplify-ul ts-advanced-list"]/li[@class="elementor-repeater-item-842568d flexify ts-action"]/div/text()')
try:
event['address'] = [x for x in address if 'Capacidad' not in x and '$' not in x][0]
except:
event['address'] = address
event['dateStamp'] = datetime.strptime(event['dateTime'][0] +tz_str, DATETIME_FORMAT)
if future_date <= event['dateStamp']:
print("Future Date")
pass
else:
locations = ps.xpath('.//*/div[@class="elementor-element elementor-element-f04aae3 elementor-widget__width-initial elementor-widget-mobile__width-initial elementor-widget elementor-widget-ts-advanced-list"]/*/li[@class="elementor-repeater-item-138dbed flexify ts-action"]/a/text()')
location = [x.replace('\t', '').replace('\n', '') for x in locations]
if len(location) == 2:
event['location'] = "{0}, {1}".format(location[1], location[0])
else:
try:
event['location'] = location[0]
except:
event['location'] = 'n/a'
address= ps.xpath('.//*/ul[@class="flexify simplify-ul ts-advanced-list"]/li[@class="elementor-repeater-item-842568d flexify ts-action"]/div/text()')
try:
event['address'] = [x for x in address if 'Capacidad' not in x and '$' not in x][0]
except:
event['address'] = address
except Exception as e:
print("Error: ", event, e)
pass
# ppr(event)
sleep(2)
sleep(1)
br.close()
data = totalLinks
data = [i for i in totalLinks if 'dateStamp' in i]
new_data = [i for i in data if 'location' in i]
print("Set:", len(totalLinks))
print("Data Set:", len(data))
print("New Data Set:", len(new_data))
paisa = []
for d in data:
for d in new_data:
if len(d['dateTime']) != 0:
if 'Familia' in d['label']:
d['category'] = 'Ot'
@@ -139,7 +157,6 @@ for d in data:
cal = Calendar.objects.get(shortcode='mde')
for d in paisa:
d['dateStamp'] =datetime.strptime(d['dateTime'][0], DATETIME_FORMAT)
try:
nvenue, created = Organization.objects.get_or_create(
name=d['venue'],
@@ -152,6 +169,7 @@ for d in paisa:
nvenue = Organization.objects.get(name=d['venue'])
nvenue.address_complete = d['address']
nvenue.save()
new_event, created = Event.objects.update_or_create(
event_type = d['category'],
show_title = d['title'],
@@ -163,6 +181,6 @@ for d in paisa:
)
new_event.calendar.add(cal)
new_event.save()
print(new_event)
# print("Success:", new_event)
digitools.updateScraper(scraper, item_count_start)