"""Scrape contract announcements and look the contract numbers up on SAM.gov.

The script runs in three phases:

1. Collect announcement links from the contracts listing at ``base_site``.
2. Load every announcement, store each paragraph as a ``Paragraph`` row and
   every contract-number-like token found in it as an ``OriginalContract``
   row.  The collected numbers are written to ``contract_list.json``.
3. Search SAM.gov for every collected number, open each search hit, and
   store what is found as ``Contract`` / ``Company`` rows.  A summary of the
   scraped notices is written to ``final_results.json``.

The work starts as soon as the module is executed (there is no ``__main__``
guard), exactly like the original flat script.
"""
import os
import sys
import json
import re
from datetime import datetime

# The bootstrap helper lives outside the default import path.
# NOTE(review): dtss.getReady() presumably configures the environment the
# model import below depends on, so the order here matters -- confirm.
sys.path.append('/var/www/digisnaxx.ado/scrapers')
import dtss
dtss.getReady()

from time import sleep
from pprint import pprint as ppr
import pytz

from selenium import webdriver as wd
from lxml import html

from contracts.models import Contract, Company, Paragraph, OriginalContract

# Offset appended to the date taken from an announcement URL.
tz_str = " -0600 UTC"
# %I (12-hour clock) is required for %p to be honoured by strptime; with %H
# the AM/PM marker is parsed but ignored and every PM time lands 12h early.
DFORMAT = "%b %d, %Y %I:%M %p %z %Z"
D2FORMAT = "%b %d, %Y %z %Z"
D3FORMAT = "%b %d %Y %z %Z"

SAM_BASE = "https://sam.gov"
SAMLINK = "https://sam.gov/search/?index=opp&page=1&pageSize=25&sort=-modifiedDate&sfm%5Bstatus%5D%5Bis_active%5D=true&sfm%5Bstatus%5D%5Bis_inactive%5D=true&sfm%5BsimpleSearch%5D%5BkeywordRadio%5D=ALL&sfm%5BsimpleSearch%5D%5BkeywordTags%5D%5B0%5D%5Bvalue%5D="
base_site = "https://www.war.gov/News/Contracts/"

# Placeholders stored when a value cannot be found on a page.
MISSED = "missed"
MISSING = "naan"

# UTC offsets for the time-zone abbreviations that end a publish date.
TZ_OFFSETS = {
    "EST": "-0500", "EDT": "-0400",
    "CST": "-0600", "CDT": "-0500",
    "MST": "-0700", "MDT": "-0600",
    "PST": "-0800", "PDT": "-0700",
}

# Anything that looks like a contract number: 12+ capitals, digits or dashes.
CONTRACT_NUMBER_RE = re.compile(r'[A-Z0-9-]{12,}')

# Listing pages to scrape: range(2, 3) means page 2 only.
LISTING_PAGES = range(2, 3)

# XPath prefixes of the three page sections the notice fields are read from.
_SOL = './/*[@id="solicitation"]/div/div'
_CLS = './/*[@id="class"]/div[2]'
_AWD = './/*[@id="award-details"]/div[2]/div'

XP_TITLE = './/*/h1[@class="card-title"]/text()'
XP_PUB_DATE = _SOL + '/div[3]/div[4]/div[4]/div/h5/text()'
XP_NOTICE_ID = _SOL + '/div[3]/div[1]/div[2]/div/h5/text()'
XP_US_OFFICE = _SOL + '/div[4]/div/div[2]/div/div[2]/h5/text()'
XP_LINE_NUM = _SOL + '/div[3]/div[2]/div[4]/div/h5/text()'
XP_MAJOR_CMD = _SOL + '/div[4]/div/div[1]/div[3]/div[2]/h5/text()'
XP_AWARDED_NAME = _AWD + '/div[4]/div[2]/div/h5/text()'
XP_AWARD_DATE = _AWD + '/div[1]/div[2]/div/h5/text()'
XP_AWARD_NUM = _AWD + '/div[2]/div[2]/div/h5/text()'
XP_UNQ_ENTITY_ID = _AWD + '/div[3]/div[3]/div/h5/text()'
XP_AWARDED_ADDR = _AWD + '/div[2]/div[3]/div[2]/h5/text()'
XP_CONTRACT_VALUE = _AWD + '/div[3]/div[4]/div[3]/div/h5/text()'
XP_DESCRIPTION = './/*[@id="desc"]/div[2]/div/div/p/text()'

# (Contract attribute, XPath) pairs that have no placeholder: a notice page
# lacking any of them is skipped.
REQUIRED_FIELDS = (
    ("inactive_date", _SOL + '/div[3]/div[3]/div[2]/div/h5/text()'),
    ("inactive_policy", _SOL + '/div[3]/div[3]/div[4]/div/h5/text()'),
    ("resp_date", _SOL + '/div[3]/div[4]/div[2]/div/h5/text()'),
    ("us_dept", _SOL + '/div[4]/div/div[1]/div[1]/div[2]/h5/text()'),
    ("us_dept_subtier", _SOL + '/div[4]/div/div[1]/div[2]/div[2]/h5/text()'),
    ("related_notice_id", _SOL + '/div[3]/div[1]/div[4]/div/h5/text()'),
    ("opp_type", _SOL + '/div[3]/div[2]/div[2]/div/h5/text()'),
    ("orig_set", _CLS + '/div[1]/div[2]/h5/text()'),
    ("prod_svc_code", _CLS + '/div[2]/div[2]/div/h5/text()'),
    ("naics_code", _CLS + '/div[2]/div[4]/div/h5/text()'),
)


def _first_text(tree, path, default=None):
    """Return the stripped first result of XPath *path*, or *default* if none."""
    found = tree.xpath(path)
    return found[0].strip() if found else default


def _announcement_date(link):
    """Parse the date from the last three dash-separated parts of *link*.

    Raises ValueError when those parts do not match ``D3FORMAT``.
    """
    # rstrip instead of a blind [:-1] so a link without a trailing slash
    # does not lose the last digit of the year.
    slug = link.rstrip("/")
    return datetime.strptime(" ".join(slug.split("-")[-3:]) + tz_str, D3FORMAT)


def _parse_pub_date(text):
    """Turn a publish date ending in a time-zone abbreviation into a datetime.

    Returns None when the trailing abbreviation is not in ``TZ_OFFSETS``;
    raises ValueError when the rest of the text does not match ``DFORMAT``.
    """
    stamp, _, abbreviation = text.rpartition(" ")
    offset = TZ_OFFSETS.get(abbreviation.strip())
    if offset is None:
        return None
    return datetime.strptime(stamp + " " + offset + " UTC", DFORMAT)


def _collect_announcement_links(br):
    """Return the announcement hrefs found on the configured listing pages."""
    # The landing page is loaded first, as before; its own links are not used.
    br.get(base_site)
    links = []
    for page_no in LISTING_PAGES:
        br.get(base_site + "?Page=" + str(page_no))
        sleep(2)  # give the page time to render
        listing = html.fromstring(br.page_source)
        links.extend(listing.xpath('.//*/p[@class="title"]/a/@href'))
    return links


def _harvest_contract_numbers(br, page_links):
    """Store paragraphs and contract numbers for every announcement link.

    Returns a list of JSON-serialisable dicts with the keys ``para``
    (paragraph id), ``contract`` (number without dashes) and ``website``.
    """
    harvested = []
    for link in page_links:
        try:
            published = _announcement_date(link)
        except ValueError as e:
            print("Skipping link without a parseable date: ", link, e)
            continue
        br.get(link)
        article = html.fromstring(br.page_source)
        for para in article.xpath('.//*/p/text()'):
            new_paragraph, created = Paragraph.objects.get_or_create(
                link=link,
                paragraph=para,
                date=published,
            )
            print(created, new_paragraph)
            for number in CONTRACT_NUMBER_RE.findall(para):
                entry = {
                    'para': new_paragraph.id,
                    'contract': number.replace("-", ""),
                    'website': link,
                }
                new_contract, created = OriginalContract.objects.get_or_create(
                    para=new_paragraph,
                    number=entry['contract'],
                )
                print(created, new_contract)
                harvested.append(entry)
    return harvested


def _store_notice(page, url, og_contract, contract_number):
    """Persist one SAM.gov notice page and return its summary dict.

    Returns None when the page has no line number (see the note below).
    Raises LookupError when one of ``REQUIRED_FIELDS`` is absent.
    """
    record = {
        # The number string, not the model instance, so the summary can be
        # written with json.dump.
        'og_contract': contract_number,
        'pub_date_txt': _first_text(page, XP_PUB_DATE, MISSED),
        'title': _first_text(page, XP_TITLE, MISSED),
        'notice_id': _first_text(page, XP_NOTICE_ID, MISSED),
    }

    contract, created = Contract.objects.get_or_create(
        title=record["title"],
        original_contract_number=og_contract,
        notice_id=record["notice_id"],
        pub_date_txt=record["pub_date_txt"],
        contract_url=url,
    )
    print('CONTRACT SUCCESS', created, contract)

    for attribute, path in REQUIRED_FIELDS:
        value = _first_text(page, path)
        if value is None:
            raise LookupError("required field not found: " + attribute)
        setattr(contract, attribute, value)
    contract.us_office = _first_text(page, XP_US_OFFICE, MISSING)
    contract.save()

    line_num = _first_text(page, XP_LINE_NUM)
    if line_num is None:
        # NOTE(review): unlike every other optional field, a missing line
        # number abandons the rest of the notice (no award data, company or
        # dates, and no summary entry).  Kept as the original behaved --
        # confirm whether a placeholder was intended instead.
        return None
    contract.line_num = line_num

    contract.major_cmd = _first_text(page, XP_MAJOR_CMD, MISSING)
    contract.award_num = _first_text(page, XP_AWARD_NUM, MISSING)
    contract.contract_value = _first_text(page, XP_CONTRACT_VALUE, MISSING)
    contract.description = _first_text(page, XP_DESCRIPTION, MISSING)
    unq_entity_id = _first_text(page, XP_UNQ_ENTITY_ID)
    contract.unq_entity_id = MISSING if unq_entity_id is None else unq_entity_id
    contract.save()

    record['awarded_name'] = _first_text(page, XP_AWARDED_NAME, MISSING)
    record['award_date'] = _first_text(page, XP_AWARD_DATE, MISSING)
    record['awarded_addr'] = _first_text(page, XP_AWARDED_ADDR, MISSING)
    record['unq_entity_id'] = contract.unq_entity_id

    if unq_entity_id is None:
        # NOTE(review): without an entity id no company is linked; this
        # matches the original outcome -- confirm it is the intended rule.
        print("\nCompany error: ", "no unique entity id on page")
    else:
        try:
            new_company, created = Company.objects.get_or_create(
                name=record["awarded_name"],
                address_complete=record["awarded_addr"],
                unq_entity_id=unq_entity_id,
            )
            print("COMPANY CREATED: ", created, new_company)
            contract.company = new_company
            contract.save()
        except Exception as e:
            # Best effort: a company problem must not lose the contract.
            print("\nCompany error: ", e)

    try:
        pub_date = _parse_pub_date(record['pub_date_txt'])
        if pub_date is None:
            print("Date Error: ", "unrecognised time zone in", record['pub_date_txt'])
        else:
            contract.pub_date = pub_date
            contract.save()
    except Exception as e:
        print("Date Error: ", e)

    try:
        # The award date has no time-zone part; -0500 is assumed, as before.
        award_stamp = record["award_date"] + " -0500 UTC"
        contract.award_date = datetime.strptime(award_stamp, D2FORMAT)
        contract.save()
    except Exception as e:
        print("Award Date Error: ", e)

    return record


def _scrape_sam_notices(br, data):
    """Search SAM.gov for every harvested number and store each search hit.

    Returns the list of summary dicts of the notices that were stored.
    """
    summaries = []
    for d in data:
        website = SAMLINK + d['contract']
        print("\nGetting Website: ", website)
        br.get(website)
        sleep(2)
        results_page = html.fromstring(br.page_source)
        links = results_page.xpath('.//*/h3/a[@class="usa-link ng-star-inserted"]/@href')
        if not links:
            continue
        # Same row for every hit of this number, so fetch it once.
        og_contract = OriginalContract.objects.get(number=d['contract'], para__id=d['para'])
        for link in links:
            print("\n++++++++\n", d['website'], "\n++++++++++\n")
            br.get(SAM_BASE + link)
            print("\nContract link: ", SAM_BASE + link)
            sleep(4)
            notice_page = html.fromstring(br.page_source)
            try:
                record = _store_notice(notice_page, br.current_url, og_contract, d['contract'])
            except LookupError as e:
                # One page with an unexpected layout must not end the run.
                print("\nSkipping notice: ", SAM_BASE + link, e)
                continue
            if record is not None:
                summaries.append(record)
                print("\nappended info: ", record['title'])
    return summaries


def main():
    """Run the three scraping phases and always close the browser."""
    print("\n+++++\n+++++\n+++++\nStarting Scrape\n+++++\n+++++\n+++++\n")
    br = wd.Chrome()
    try:
        page_links = _collect_announcement_links(br)

        data = _harvest_contract_numbers(br, page_links)
        with open('contract_list.json', 'w') as fp:
            json.dump(data, fp)

        base_data = _scrape_sam_notices(br, data)
        with open('final_results.json', 'w') as fp:
            json.dump(base_data, fp)
    finally:
        # Release the Chrome process even when a phase fails.
        br.quit()


# Executed unconditionally so importing or running the file behaves as the
# original flat script did.
main()