# Source: scrapers/WarContracts/get_contracts.py
# (originally 281 lines, 11 KiB, Python — header residue from a file
# listing converted to comments so this file parses.)
import os, sys
import json
import re
from datetime import datetime
sys.path.append('/var/www/digisnaxx.ado/scrapers')
import dtss
dtss.getReady()
from time import sleep
from pprint import pprint as ppr
import pytz
from selenium import webdriver as wd
from lxml import html
from contracts.models import Contract, Company, Paragraph, OriginalContract
# Announce the start of a scrape run on stdout.
print("\n+++++\n+++++\n+++++\nStarting Scrape\n+++++\n+++++\n+++++\n")

# Fixed UTC-offset suffix appended to dates parsed out of article URLs.
tz_str = " -0600 UTC"

# strptime formats used throughout:
#   DFORMAT  - full timestamp with time-of-day and timezone
#   D2FORMAT - comma-separated date with timezone
#   D3FORMAT - space-separated date (as extracted from URLs) with timezone
DFORMAT = "%b %d, %Y %H:%M %p %z %Z"
D2FORMAT = "%b %d, %Y %z %Z"
D3FORMAT = "%b %d %Y %z %Z"

# SAM.gov root (for resolving relative hrefs) and the canned keyword-search
# URL that a contract number gets appended to.
SAM_BASE = "https://sam.gov"
SAMLINK = "https://sam.gov/search/?index=opp&page=1&pageSize=25&sort=-modifiedDate&sfm%5Bstatus%5D%5Bis_active%5D=true&sfm%5Bstatus%5D%5Bis_inactive%5D=true&sfm%5BsimpleSearch%5D%5BkeywordRadio%5D=ALL&sfm%5BsimpleSearch%5D%5BkeywordTags%5D%5B0%5D%5Bvalue%5D="

# Contract-announcement archive that seeds the scrape.
base_site = "https://www.war.gov/News/Contracts/"

# Drive a real Chrome instance so JavaScript-rendered pages load fully.
br = wd.Chrome()
br.get(base_site)
ps = html.fromstring(br.page_source)
# Collect announcement-article links from the archive's paginated index.
# NOTE: range(2, 3) fetches only page 2; widen the range to scrape more of
# the archive. Fixes: list growth now uses extend() instead of rebinding
# `page_links = page_links + [...]` (quadratic), and the dead commented-out
# duplicate of this loop (pages 10-25) has been removed.
page_links = []
for page_num in range(2, 3):
    br.get(base_site + "?Page=" + str(page_num))
    sleep(2)  # let the JS-rendered listing finish loading
    ps = html.fromstring(br.page_source)
    page_links.extend(ps.xpath('.//*/p[@class="title"]/a/@href'))
# Visit every announcement article, store each paragraph, and extract
# candidate contract numbers from the paragraph text.
# Fixes: the article date (`nlink`) depends only on `link`, so it is now
# parsed once per article instead of once per paragraph; the redundant
# `len(contracts) > 0` guard is gone (an empty findall just skips the loop);
# the two `created` flags no longer shadow each other.
data = []
base_data = []
for link in page_links:
    br.get(link)
    ps = html.fromstring(br.page_source)
    paras = ps.xpath('.//*/p/text()')
    # The last three hyphen-separated URL segments encode the article date
    # (e.g. ".../contract-Oct-1-2025/"); [:-1] drops the trailing slash and
    # tz_str pins it to the fixed -0600 offset.
    nlink = " ".join(link.split("-")[-3:])[:-1] + tz_str
    article_date = datetime.strptime(nlink, D3FORMAT)
    for para in paras:
        new_paragraph, para_created = Paragraph.objects.get_or_create(
            link=link,
            paragraph=para,
            date=article_date,
        )
        print(para_created, new_paragraph)
        # Contract numbers look like runs of 12+ uppercase letters, digits,
        # and hyphens.
        contracts = re.findall(r'[A-Z0-9-]{12,}', para)
        for contract in contracts:
            i = {
                'para': new_paragraph.id,
                'contract': contract.replace("-", ""),
                'website': link,
            }
            new_contract, oc_created = OriginalContract.objects.get_or_create(
                para=new_paragraph,
                number=i['contract'],
            )
            print(oc_created, new_contract)
            data.append(i)
with open('contract_list.json', 'w') as fp:
    json.dump(data, fp)
def _xp_text(tree, xpath, default=None):
    """Return the stripped text of the first xpath match, or *default*.

    Replaces the previous pattern of unguarded ``tree.xpath(...)[0].strip()``
    calls, several of which would raise IndexError and kill the whole run
    whenever a single field was absent from a SAM.gov page.
    """
    found = tree.xpath(xpath)
    return found[0].strip() if found else default


# Timezone abbreviation -> fixed UTC-offset suffix for strptime.
# BUGFIX: the original chain mapped EDT to -0500 and CDT to -0600; the
# correct daylight-saving offsets are EDT = -0400 and CDT = -0500.
TZ_OFFSETS = {
    "EST": " -0500 UTC",
    "EDT": " -0400 UTC",
    "CST": " -0600 UTC",
    "CDT": " -0500 UTC",
    "MST": " -0700 UTC",
    "PST": " -0800 UTC",
}

# For every extracted contract number, search SAM.gov and scrape each
# matching opportunity page into a Contract row (plus its awarded Company).
for d in data:
    website = SAMLINK + d['contract']
    print("\nGetting Website: ", website)
    br.get(website)
    sleep(2)
    ps = html.fromstring(br.page_source)
    links = ps.xpath('.//*/h3/a[@class="usa-link ng-star-inserted"]/@href')
    for link in links:
        print("\n++++++++\n", d['website'], "\n++++++++++\n")
        br.get(SAM_BASE + link)
        print("\nContract link: ", SAM_BASE + link)
        sleep(4)  # SAM.gov detail pages render slowly
        ps = html.fromstring(br.page_source)

        i = {}
        i['og_contract'] = OriginalContract.objects.get(number=d['contract'], para__id=d['para'])
        i['pub_date_txt'] = _xp_text(ps, './/*[@id="solicitation"]/div/div/div[3]/div[4]/div[4]/div/h5/text()', "missed")
        # Preserve the original fallback: when no title node exists, store
        # the raw (empty-list) xpath result rather than a sentinel string.
        title = _xp_text(ps, './/*/h1[@class="card-title"]/text()')
        i['title'] = title if title is not None else ps.xpath('.//*/h1[@class="card-title"]/text()')
        i['notice_id'] = _xp_text(ps, './/*[@id="solicitation"]/div/div/div[3]/div[1]/div[2]/div/h5/text()', "missed")

        contract, created = Contract.objects.get_or_create(
            title=i["title"],
            original_contract_number=i["og_contract"],
            notice_id=i["notice_id"],
            pub_date_txt=i["pub_date_txt"],
            contract_url=br.current_url,
        )
        print('CONTRACT SUCCESS', created, contract)

        # Solicitation metadata. Missing fields now fall back to the
        # script's existing "naan" sentinel instead of raising IndexError.
        contract.inactive_date = _xp_text(ps, './/*[@id="solicitation"]/div/div/div[3]/div[3]/div[2]/div/h5/text()', "naan")
        contract.inactive_policy = _xp_text(ps, './/*[@id="solicitation"]/div/div/div[3]/div[3]/div[4]/div/h5/text()', "naan")
        contract.resp_date = _xp_text(ps, './/*[@id="solicitation"]/div/div/div[3]/div[4]/div[2]/div/h5/text()', "naan")
        contract.us_dept = _xp_text(ps, './/*[@id="solicitation"]/div/div/div[4]/div/div[1]/div[1]/div[2]/h5/text()', "naan")
        contract.us_dept_subtier = _xp_text(ps, './/*[@id="solicitation"]/div/div/div[4]/div/div[1]/div[2]/div[2]/h5/text()', "naan")
        contract.related_notice_id = _xp_text(ps, './/*[@id="solicitation"]/div/div/div[3]/div[1]/div[4]/div/h5/text()', "naan")
        contract.opp_type = _xp_text(ps, './/*[@id="solicitation"]/div/div/div[3]/div[2]/div[2]/div/h5/text()', "naan")
        contract.orig_set = _xp_text(ps, './/*[@id="class"]/div[2]/div[1]/div[2]/h5/text()', "naan")
        contract.prod_svc_code = _xp_text(ps, './/*[@id="class"]/div[2]/div[2]/div[2]/div/h5/text()', "naan")
        contract.naics_code = _xp_text(ps, './/*[@id="class"]/div[2]/div[2]/div[4]/div/h5/text()', "naan")
        contract.us_office = _xp_text(ps, './/*[@id="solicitation"]/div/div/div[4]/div/div[2]/div/div[2]/h5/text()', "naan")
        contract.save()

        # Preserved original behavior: no line number => skip the rest of
        # the processing for this search hit entirely.
        line_num = _xp_text(ps, './/*[@id="solicitation"]/div/div/div[3]/div[2]/div[4]/div/h5/text()')
        if line_num is None:
            continue
        contract.line_num = line_num

        contract.major_cmd = _xp_text(ps, './/*[@id="solicitation"]/div/div/div[4]/div/div[1]/div[3]/div[2]/h5/text()', "naan")

        # Award details.
        i['awarded_name'] = _xp_text(ps, './/*[@id="award-details"]/div[2]/div/div[4]/div[2]/div/h5/text()', "naan")
        i['award_date'] = _xp_text(ps, './/*[@id="award-details"]/div[2]/div/div[1]/div[2]/div/h5/text()', "naan")
        contract.award_num = _xp_text(ps, './/*[@id="award-details"]/div[2]/div/div[2]/div[2]/div/h5/text()', "naan")

        # Preserved original behavior: i['unq_entity_id'] is set only when
        # the field is present; a missing key makes the Company creation
        # below fail (and get logged) instead of creating a bogus company.
        unq_entity_id = _xp_text(ps, './/*[@id="award-details"]/div[2]/div/div[3]/div[3]/div/h5/text()')
        if unq_entity_id is not None:
            i['unq_entity_id'] = unq_entity_id
        contract.unq_entity_id = unq_entity_id if unq_entity_id is not None else "naan"

        i['awarded_addr'] = _xp_text(ps, './/*[@id="award-details"]/div[2]/div/div[2]/div[3]/div[2]/h5/text()', "naan")
        contract.contract_value = _xp_text(ps, './/*[@id="award-details"]/div[2]/div/div[3]/div[4]/div[3]/div/h5/text()', "naan")
        contract.description = _xp_text(ps, './/*[@id="desc"]/div[2]/div/div/p/text()', "naan")
        contract.save()

        base_data.append(i)
        print("\nappended info: ", i['title'])

        try:
            new_company, created = Company.objects.get_or_create(
                name=i["awarded_name"],
                address_complete=i["awarded_addr"],
                unq_entity_id=i["unq_entity_id"],
            )
            print("COMPANY CREATED: ", created, new_company)
            contract.company = new_company
            contract.save()
        except Exception as e:
            print("\nCompany error: ", e)

        # Parse the publication timestamp: swap the trailing timezone
        # abbreviation for a fixed numeric offset strptime understands.
        try:
            tz_abbr = contract.pub_date_txt.split(" ")[-1].strip()
            offset = TZ_OFFSETS.get(tz_abbr)
            if offset is not None:
                ndate = " ".join(contract.pub_date_txt.split(" ")[:-1]) + offset
                contract.pub_date = datetime.strptime(ndate, DFORMAT)
                contract.save()
        except Exception as e:
            print("Date Error: ", e)

        # Award dates carry no timezone on SAM.gov; assume Eastern (-0500).
        try:
            contract.award_date = datetime.strptime(i["award_date"] + " -0500 UTC", D2FORMAT)
            contract.save()
        except Exception as e:
            print("Award Date Error: ", e)

with open('final_results.json', 'w') as fp:
    json.dump(base_data, fp)