more scrapers
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
import os, sys
|
||||
from datetime import datetime
|
||||
from datetime import datetime, timedelta
|
||||
from dateutil import relativedelta
|
||||
import json
|
||||
|
||||
@@ -12,9 +12,10 @@ dtss.getReady()
|
||||
|
||||
from time import sleep
|
||||
from pprint import pprint as ppr
|
||||
from zoneinfo import ZoneInfo
|
||||
import pytz
|
||||
|
||||
from events.models import Organization, Scraper, Calendar
|
||||
from events.models import Event, Organization, Scraper, Calendar
|
||||
import events.digitools as digitools
|
||||
|
||||
venue, created = Organization.objects.get_or_create(
|
||||
@@ -25,9 +26,14 @@ venue, created = Organization.objects.get_or_create(
|
||||
)
|
||||
|
||||
scraper,item_count_start,virtcal = digitools.getScraper(venue, venue.website, 'mde')
|
||||
tz_str = " -0500 UTC"
|
||||
DATETIME_FORMAT = '%d/%m/%y %I:%M %p %z %Z'
|
||||
# DATETIME_FORMAT = '%B %d %Y %I:%M%p %z %Z'
|
||||
# DATETIME_FORMAT = '%B %d %Y %I:%M %p %z %Z'
|
||||
# DATETIME_FORMAT_2 = '%A, %B %d @ %I%p %Y %z %Z'
|
||||
|
||||
DATETIME_FORMAT = '%B %d %Y %I:%M%p'
|
||||
DATETIME_FORMAT_2 = '%A, %B %d @ %I%p %Y'
|
||||
td = timedelta(days=31)
|
||||
future_date = datetime.now(ZoneInfo("America/Chicago")) + td
|
||||
|
||||
# with open('data.json') as f:
|
||||
# totalLinks = json.load(f)
|
||||
@@ -52,18 +58,18 @@ def getLinks(br, url, links):
|
||||
# newlinks = ps.xpath('.//*/div[@class="e-con-inner"]/*/a/@href')
|
||||
events = ps.xpath('.//*/div[@class="e-con-inner"]')
|
||||
for event in events:
|
||||
e = {}
|
||||
ev = {}
|
||||
try:
|
||||
e['link'] = event.xpath('.//*/a/@href')[0]
|
||||
e['title'] = event.xpath('.//*/h3/a/text()')[0]
|
||||
e['venue'] = event.xpath('.//*/ul/li/a/text()')[-1:][0].replace('\n', '').replace('\t', '')
|
||||
ev['link'] = event.xpath('.//*/a/@href')[0]
|
||||
ev['title'] = event.xpath('.//*/h3/a/text()')[0]
|
||||
ev['venue'] = event.xpath('.//*/ul/li/a/text()')[-1:][0].replace('\n', '').replace('\t', '')
|
||||
# e['venue'] = event.xpath('.//*/ul/li/a/text()')[-1:][0]
|
||||
e['venueLink'] = event.xpath('.//*/ul/li/a/@href')[1]
|
||||
ev['venueLink'] = event.xpath('.//*/ul/li/a/@href')[1]
|
||||
label= event.xpath('.//*/li[@class="elementor-repeater-item-46edd7d flexify ts-action"]/div/text()')
|
||||
e['label'] = ''.join([x.replace('\t', '').replace('\n', '') for x in label]).strip()
|
||||
newLinks.append(e)
|
||||
ev['label'] = ''.join([x.replace('\t', '').replace('\n', '') for x in label]).strip()
|
||||
newLinks.append(ev)
|
||||
except Exception as e:
|
||||
print("Error: ", e)
|
||||
print("Error: ", ev, e)
|
||||
|
||||
links = links + newLinks
|
||||
return links
|
||||
@@ -89,35 +95,47 @@ ppr(len(totalLinks))
|
||||
|
||||
for event in totalLinks:
|
||||
br.get(event['link'])
|
||||
sleep(2)
|
||||
sleep(1)
|
||||
ps = html.fromstring(br.page_source)
|
||||
dateTime= ps.xpath('.//*/div[@class="elementor-element elementor-element-d9beb21 elementor-widget elementor-widget-heading"]/span[@class="elementor-heading-title elementor-size-default"]/text()')
|
||||
event['dateTime'] = [x[3:].split('-')[0].strip() for x in dateTime]
|
||||
locations = ps.xpath('.//*/div[@class="elementor-element elementor-element-f04aae3 elementor-widget__width-initial elementor-widget-mobile__width-initial elementor-widget elementor-widget-ts-advanced-list"]/*/li[@class="elementor-repeater-item-138dbed flexify ts-action"]/a/text()')
|
||||
location = [x.replace('\t', '').replace('\n', '') for x in locations]
|
||||
if len(location) == 2:
|
||||
event['location'] = "{0}, {1}".format(location[1], location[0])
|
||||
else:
|
||||
try:
|
||||
event['location'] = location[0]
|
||||
except:
|
||||
event['location'] = 'n/a'
|
||||
address= ps.xpath('.//*/ul[@class="flexify simplify-ul ts-advanced-list"]/li[@class="elementor-repeater-item-842568d flexify ts-action"]/div/text()')
|
||||
try:
|
||||
event['address'] = [x for x in address if 'Capacidad' not in x and '$' not in x][0]
|
||||
except:
|
||||
event['address'] = address
|
||||
event['dateStamp'] = datetime.strptime(event['dateTime'][0] +tz_str, DATETIME_FORMAT)
|
||||
if future_date <= event['dateStamp']:
|
||||
print("Future Date")
|
||||
pass
|
||||
else:
|
||||
locations = ps.xpath('.//*/div[@class="elementor-element elementor-element-f04aae3 elementor-widget__width-initial elementor-widget-mobile__width-initial elementor-widget elementor-widget-ts-advanced-list"]/*/li[@class="elementor-repeater-item-138dbed flexify ts-action"]/a/text()')
|
||||
location = [x.replace('\t', '').replace('\n', '') for x in locations]
|
||||
if len(location) == 2:
|
||||
event['location'] = "{0}, {1}".format(location[1], location[0])
|
||||
else:
|
||||
try:
|
||||
event['location'] = location[0]
|
||||
except:
|
||||
event['location'] = 'n/a'
|
||||
address= ps.xpath('.//*/ul[@class="flexify simplify-ul ts-advanced-list"]/li[@class="elementor-repeater-item-842568d flexify ts-action"]/div/text()')
|
||||
try:
|
||||
event['address'] = [x for x in address if 'Capacidad' not in x and '$' not in x][0]
|
||||
except:
|
||||
event['address'] = address
|
||||
except Exception as e:
|
||||
print("Error: ", event, e)
|
||||
pass
|
||||
# ppr(event)
|
||||
|
||||
sleep(2)
|
||||
sleep(1)
|
||||
br.close()
|
||||
|
||||
data = totalLinks
|
||||
data = [i for i in totalLinks if 'dateStamp' in i]
|
||||
new_data = [i for i in data if 'location' in i]
|
||||
print("Set:", len(totalLinks))
|
||||
print("Data Set:", len(data))
|
||||
print("New Data Set:", len(new_data))
|
||||
|
||||
paisa = []
|
||||
|
||||
for d in data:
|
||||
for d in new_data:
|
||||
if len(d['dateTime']) != 0:
|
||||
if 'Familia' in d['label']:
|
||||
d['category'] = 'Ot'
|
||||
@@ -139,7 +157,6 @@ for d in data:
|
||||
cal = Calendar.objects.get(shortcode='mde')
|
||||
|
||||
for d in paisa:
|
||||
d['dateStamp'] =datetime.strptime(d['dateTime'][0], DATETIME_FORMAT)
|
||||
try:
|
||||
nvenue, created = Organization.objects.get_or_create(
|
||||
name=d['venue'],
|
||||
@@ -152,6 +169,7 @@ for d in paisa:
|
||||
nvenue = Organization.objects.get(name=d['venue'])
|
||||
nvenue.address_complete = d['address']
|
||||
nvenue.save()
|
||||
|
||||
new_event, created = Event.objects.update_or_create(
|
||||
event_type = d['category'],
|
||||
show_title = d['title'],
|
||||
@@ -163,6 +181,6 @@ for d in paisa:
|
||||
)
|
||||
new_event.calendar.add(cal)
|
||||
new_event.save()
|
||||
print(new_event)
|
||||
# print("Success:", new_event)
|
||||
|
||||
digitools.updateScraper(scraper, item_count_start)
|
||||
Reference in New Issue
Block a user