"""Scrape event listings from eventario.co and store them as Event rows.

Run as a script with one positional argument (the run environment), which is
passed to ``digitools.getBrowser``. The script walks three category pages,
collects the event cards, visits each event page for date/location/address,
keeps the events whose location mentions "Antioquia", and saves them through
the Django ORM on the calendar with shortcode ``mde``.
"""
import os
import sys
from datetime import datetime
from dateutil import relativedelta
import json

from selenium.webdriver.common.by import By
from lxml import html

import django

# Django must be configured before any model import below.
sys.path.append('../../../../')
os.environ['DJANGO_SETTINGS_MODULE'] = 'config.django.local'
django.setup()

from time import sleep
from pprint import pprint as ppr
import pytz

# Event was used at the bottom of the script without being imported, which
# raised NameError on the first save.
from events.models import Organization, Scraper, Calendar, Event
import events.digitools as digitools

venue, created = Organization.objects.get_or_create(
    name="Events Medellin",
    city="Medellin",
    website="https://eventario.co/events-category/social/",
    is_venue=True
)

scraper, item_count_start, virtcal = digitools.getScraper(venue, 'mde')

DATETIME_FORMAT = '%B %d %Y %I:%M%p'
DATETIME_FORMAT_2 = '%A, %B %d @ %I%p %Y'


def _cleanText(text):
    """Return *text* with tab and newline characters removed."""
    return text.replace('\t', '').replace('\n', '')


def _parseCard(card):
    """Build the event dict for one listing card; raises IndexError if a field is missing."""
    item = {}
    item['link'] = card.xpath('.//*/a/@href')[0]
    item['title'] = card.xpath('.//*/h3/a/text()')[0]
    item['venue'] = _cleanText(card.xpath('.//*/ul/li/a/text()')[-1])
    item['venueLink'] = card.xpath('.//*/ul/li/a/@href')[1]
    label = card.xpath('.//*/li[@class="elementor-repeater-item-46edd7d flexify ts-action"]/div/text()')
    item['label'] = ''.join([_cleanText(x) for x in label]).strip()
    return item


def getLinks(br, url, links):
    """Collect event cards from one category page.

    Args:
        br: Selenium browser used to load and paginate the page.
        url: Category page to open.
        links: Event dicts gathered so far; not modified.

    Returns:
        A new list: ``links`` followed by one dict per parsed card, with the
        keys ``link``, ``title``, ``venue``, ``venueLink`` and ``label``.
    """
    br.get(url)
    sleep(2)
    br.execute_script("window.scrollTo(0, window.scrollY + 1500)")
    sleep(2)

    # Keep clicking the pagination link until it can no longer be found or
    # clicked; any failure here is treated as "no more pages".
    while True:
        try:
            pager = br.find_element(By.XPATH, ".//*/div[@class='feed-pagination flexify']/a")
            pager.click()
            sleep(2)
            br.execute_script("window.scrollTo(0, window.scrollY + 1100)")
            sleep(2)
        except Exception:
            break

    ps = html.fromstring(br.page_source)
    newLinks = []
    for card in ps.xpath('.//*/div[@class="e-con-inner"]'):
        try:
            newLinks.append(_parseCard(card))
        except Exception as err:
            # Best effort: a card missing any expected field is reported and
            # skipped. The error gets its own name so it no longer rebinds
            # the dict being built.
            print("Error: ", err)
    return links + newLinks


def _scrapeDetails(br, event):
    """Open the event page and add 'dateTime', 'location' and 'address' to *event*."""
    br.get(event['link'])
    sleep(2)
    ps = html.fromstring(br.page_source)

    dateTime = ps.xpath('.//*/div[@class="elementor-element elementor-element-d9beb21 elementor-widget elementor-widget-heading"]/span[@class="elementor-heading-title elementor-size-default"]/text()')
    # Drop the first three characters and keep the part before the first '-'.
    event['dateTime'] = [x[3:].split('-')[0].strip() for x in dateTime]

    locations = ps.xpath('.//*/div[@class="elementor-element elementor-element-f04aae3 elementor-widget__width-initial elementor-widget-mobile__width-initial elementor-widget elementor-widget-ts-advanced-list"]/*/li[@class="elementor-repeater-item-138dbed flexify ts-action"]/a/text()')
    location = [_cleanText(x) for x in locations]
    if len(location) == 2:
        event['location'] = "{0}, {1}".format(location[1], location[0])
    elif location:
        event['location'] = location[0]
    else:
        event['location'] = 'n/a'

    address = ps.xpath('.//*/ul[@class="flexify simplify-ul ts-advanced-list"]/li[@class="elementor-repeater-item-842568d flexify ts-action"]/div/text()')
    candidates = [x for x in address if 'Capacidad' not in x and '$' not in x]
    # The old fallback stored the raw xpath result list, which was later
    # written to address_complete; use the same 'n/a' marker as location.
    event['address'] = candidates[0] if candidates else 'n/a'


def _categorize(d):
    """Return the event_type code chosen from the event's label and title."""
    label = d['label']
    if 'Familia' in label:
        return 'Ot'
    if 'Comedia' in label:
        return 'Co'
    if ('Magic' in d['title']) or ('Juegos' in label):
        return 'Ot'
    if ('Conferencias' in label) or ('Intercambio' in label):
        return 'Ed'
    return 'Mu'


def _parseDate(text):
    """Parse *text* with DATETIME_FORMAT; return None when it does not match."""
    try:
        return datetime.strptime(text, DATETIME_FORMAT)
    except ValueError:
        return None


if len(sys.argv) >= 2:
    arg1 = sys.argv[1]
    br = digitools.getBrowser(arg1)
else:
    print("No run_env")
    sys.exit()

urls = ["https://eventario.co/events-category/social/",
        "https://eventario.co/events-category/musica/",
        "https://eventario.co/events-category/cultura/"]

try:
    allLinks = []
    for url in urls:
        allLinks = getLinks(br, url, allLinks)

    # De-duplicate by title; the last card seen for a title wins.
    totalLinks = list({v['title']: v for v in allLinks}.values())
    ppr(len(totalLinks))

    for event in totalLinks:
        _scrapeDetails(br, event)
        sleep(2)
finally:
    # Close the browser even when scraping fails part-way through.
    br.close()

data = totalLinks
print("Set:", len(totalLinks))

paisa = []
for d in data:
    # Events without a scraped date cannot be stored; skip them.
    if len(d['dateTime']) == 0:
        continue
    d['category'] = _categorize(d)
    if "Antioquia" in d['location']:
        d['city'] = d['location'].split(',')[0]
        paisa.append(d)

cal = Calendar.objects.get(shortcode='mde')

for d in paisa:
    stamp = _parseDate(d['dateTime'][0])
    if stamp is None:
        # One malformed date should not abort the rest of the run.
        print("Error: ", "unparseable date {0!r} for {1}".format(d['dateTime'][0], d['title']))
        continue
    d['dateStamp'] = stamp

    try:
        nvenue, created = Organization.objects.get_or_create(
            name=d['venue'],
            city=d['city'],
            website=d['venueLink'],
            address_complete=d['address'],
            is_venue=True
        )
    except Exception:
        # NOTE(review): presumably reached when an Organization with this
        # name already exists under different details - confirm against the
        # model's constraints. Reuse that row and refresh its address.
        nvenue = Organization.objects.get(name=d['venue'])
        nvenue.address_complete = d['address']
        nvenue.save()

    new_event, created = Event.objects.update_or_create(
        event_type=d['category'],
        show_title=d['title'],
        show_link=d['link'],
        show_date=d['dateStamp'],
        show_day=d['dateStamp'],
        scraper=scraper,
        venue=nvenue
    )
    new_event.calendar.add(cal)
    new_event.save()
    print(new_event)

digitools.updateScraper(scraper, item_count_start)