"""Scrape upcoming events from eventario.co category listings for Medellin.

Flow:
  1. Load each category listing page, clicking "load more" until exhausted,
     and collect event preview cards (link/title/venue/label).
  2. Visit each event detail page in the current batch window to extract
     date/time, location, and address.
  3. Categorize events, keep those in Antioquia, and upsert venue and
     event rows through the Django ORM.

Run with the target browser environment as argv[1], e.g. `python scrape.py prod`.
"""
import os, sys
from datetime import datetime, timedelta
from dateutil import relativedelta
import json

from selenium.webdriver.common.by import By
from lxml import html

sys.path.append('/var/www/digisnaxx.ado/scrapers')
import dtss
dtss.getReady()

from time import sleep
from pprint import pprint as ppr
from zoneinfo import ZoneInfo
import pytz

from events.models import Event, Organization, Scraper, Calendar
import events.digitools as digitools

# Organization row representing the aggregator site itself (scraper owner).
venue, created = Organization.objects.get_or_create(
    name="Events Medellin",
    city="Medellin",
    website="https://eventario.co/events-category/social/",
    is_venue=True
)
scraper, item_count_start, virtcal = digitools.getScraper(venue, venue.website, 'mde')

# The site shows Colombian local times with no offset; append -0500 so
# strptime produces timezone-aware datetimes.
tz_str = " -0500 UTC"
DATETIME_FORMAT = '%d/%m/%y %I:%M %p %z %Z'

# Skip events more than ~a month out.
td = timedelta(days=31)
future_date = datetime.now(ZoneInfo("America/Chicago")) + td


def getLinks(br, url, links):
    """Collect event preview dicts from one category listing page.

    Args:
        br: a live Selenium WebDriver.
        url: category listing URL to scrape.
        links: previously accumulated list of event dicts.

    Returns:
        A new list: ``links`` plus the events found on this page. Each event
        dict has keys 'link', 'title', 'venue', 'venueLink', 'label'.
    """
    br.get(url)
    sleep(2)
    br.execute_script("window.scrollTo(0, window.scrollY + 1500)")
    sleep(2)
    # Click the "load more" pagination button until it is no longer present.
    while True:
        try:
            div = br.find_element(By.XPATH, ".//*/div[@class='feed-pagination flexify']/a")
            div.click()
            sleep(2)
            br.execute_script("window.scrollTo(0, window.scrollY + 1375)")
            sleep(2)
        except Exception:
            # NoSuchElementException (or a stale/unclickable element) means
            # there are no more pages to load.
            break

    ps = html.fromstring(br.page_source)
    newLinks = []
    events = ps.xpath('.//*/div[@class="ts-preview"]')
    for event in events:
        ev = {}
        try:
            ev['link'] = event.xpath('.//*/h3/a/@href')[0]
            ev['title'] = event.xpath('.//*/h3/a/text()')[0]
            # The last matching action item holds the venue name/link.
            ev['venue'] = event.xpath('.//*/ul/li[@class="elementor-repeater-item-46edd7d flexify ts-action"]/a[@class="ts-action-con"]/text()')[-1].replace('\n', '').replace('\t', '')
            ev['venueLink'] = event.xpath('.//*/ul/li[@class="elementor-repeater-item-46edd7d flexify ts-action"]/a[@class="ts-action-con"]/@href')[-1]
            label = event.xpath('.//*/li[@class="elementor-repeater-item-46edd7d flexify ts-action"]/div/text()')
            ev['label'] = ''.join([x.replace('\t', '').replace('\n', '') for x in label]).strip()
            newLinks.append(ev)
        except Exception as e:
            # Card missing an expected element; skip it but keep going.
            print("Error: ", ev, e)
    return links + newLinks


if len(sys.argv) >= 2:
    arg1 = sys.argv[1]
    br = digitools.getBrowser(arg1)
else:
    print("No run_env")
    quit()

urls = ["https://eventario.co/events-category/social/",
        "https://eventario.co/events-category/musica/",
        "https://eventario.co/events-category/cultura/"]
allLinks = []
for url in urls:
    allLinks = getLinks(br, url, allLinks)

# De-duplicate by title (last occurrence wins).
totalLinks = list({v['title']: v for v in allLinks}.values())
ppr(totalLinks)
ppr(len(totalLinks))

# Manual batching window: only this slice of events gets detail-scraped per run.
for event in totalLinks[175:250]:
    br.get(event['link'])
    sleep(1)
    ps = html.fromstring(br.page_source)
    dateTime = ps.xpath('.//*/div[@class="elementor-element elementor-element-d9beb21 elementor-widget elementor-widget-heading"]/span[@class="elementor-heading-title elementor-size-default"]/text()')
    # Drop the 3-char weekday prefix and keep only the start of a
    # "start - end" time range.
    event['dateTime'] = [x[3:].split('-')[0].strip() for x in dateTime]
    try:
        event['dateStamp'] = datetime.strptime(event['dateTime'][0] + tz_str, DATETIME_FORMAT)
        if future_date <= event['dateStamp']:
            # Beyond the lookahead window; leave detail fields unset so the
            # event is filtered out below.
            print("Future Date")
        else:
            locations = ps.xpath('.//*/div[@class="elementor-element elementor-element-f04aae3 elementor-widget__width-initial elementor-widget-mobile__width-initial elementor-widget elementor-widget-ts-advanced-list"]/*/li[@class="elementor-repeater-item-138dbed flexify ts-action"]/a/text()')
            location = [x.replace('\t', '').replace('\n', '') for x in locations]
            if len(location) == 2:
                # Site lists "city, region" parts in reverse order.
                event['location'] = "{0}, {1}".format(location[1], location[0])
            else:
                try:
                    event['location'] = location[0]
                except IndexError:
                    event['location'] = 'n/a'
            address = ps.xpath('.//*/ul[@class="flexify simplify-ul ts-advanced-list"]/li[@class="elementor-repeater-item-842568d flexify ts-action"]/div/text()')
            try:
                # The list mixes address, capacity, and price; keep the
                # first entry that is neither capacity nor price.
                event['address'] = [x for x in address if 'Capacidad' not in x and '$' not in x][0]
            except IndexError:
                event['address'] = address
    except Exception as e:
        # Unparseable date or missing detail markup; record and move on.
        print("Error: ", event, e)
    sleep(1)

br.close()

# Events that parsed a date; of those, events with a usable location.
data = [i for i in totalLinks if 'dateStamp' in i]
new_data = [i for i in data if 'location' in i]
print("Set:", len(totalLinks))
print("Data Set:", len(data))
print("New Data Set:", len(new_data))

paisa = []
for d in new_data:
    if len(d['dateTime']) != 0:
        # Map site labels to internal single-letter-ish category codes.
        if 'Familia' in d['label']:
            d['category'] = 'Ot'
        elif 'Comedia' in d['label']:
            d['category'] = 'Co'
        elif ('Magic' in d['title']) or ('Juegos' in d['label']):
            d['category'] = 'Ot'
        elif ('Conferencias' in d['label']) or ('Intercambio' in d['label']):
            d['category'] = 'Ed'
        else:
            d['category'] = 'Mu'
        # Keep only events in Antioquia; city is the first location component.
        if "Antioquia" in d['location']:
            try:
                d['city'] = d['location'].split(',')[0]
                paisa.append(d)
            except Exception:
                continue

cal = Calendar.objects.get(shortcode='mde')
print("run paisa")
ppr(paisa)

for d in paisa:
    try:
        nvenue, created = Organization.objects.get_or_create(
            name=d['venue'],
            city=d['city'],
            website=d['venueLink'],
            address_complete=d['address'],
            is_venue=True
        )
    except Exception:
        # A venue with this name already exists but with differing fields
        # (get_or_create raised); fall back to fetching it and refreshing
        # its address.
        nvenue = Organization.objects.get(name=d['venue'])
        nvenue.address_complete = d['address']
        nvenue.save()
    try:
        new_event, created = Event.objects.update_or_create(
            event_type=d['category'],
            show_title=d['title'],
            show_link=d['link'],
            show_date=d['dateStamp'],
            scraper=scraper,
            venue=nvenue
        )
        new_event.calendar.add(cal)
        new_event.save()
        print(new_event, created, new_event.scraper)
    except Exception as e:  # was misspelled "Execption" -> NameError at runtime
        print(e)

digitools.updateScraper(scraper, item_count_start)