281 lines
11 KiB
Python
281 lines
11 KiB
Python
|
|
import os, sys
|
||
|
|
import json
|
||
|
|
import re
|
||
|
|
from datetime import datetime
|
||
|
|
|
||
|
|
sys.path.append('/var/www/digisnaxx.ado/scrapers')
|
||
|
|
import dtss
|
||
|
|
dtss.getReady()
|
||
|
|
|
||
|
|
from time import sleep
|
||
|
|
from pprint import pprint as ppr
|
||
|
|
import pytz
|
||
|
|
|
||
|
|
from selenium import webdriver as wd
|
||
|
|
from lxml import html
|
||
|
|
|
||
|
|
from contracts.models import Contract, Company, Paragraph, OriginalContract
|
||
|
|
|
||
|
|
|
||
|
|
# Banner so the start of a run is easy to spot in the console output.
print("\n+++++\n+++++\n+++++\nStarting Scrape\n+++++\n+++++\n+++++\n")

# Offset suffix appended to the date text taken from an article link so that
# it can be parsed as a timezone-aware value with D3FORMAT.
tz_str = " -0600 UTC"

# strptime formats. Each one expects a trailing "<+/-HHMM> UTC" suffix.
# DFORMAT reads the hour with %I (12-hour clock): strptime only applies the
# AM/PM marker (%p) to the hour when %I is used, so reading it with %H would
# leave every PM timestamp twelve hours early.
DFORMAT = "%b %d, %Y %I:%M %p %z %Z"
D2FORMAT = "%b %d, %Y %z %Z"
D3FORMAT = "%b %d %Y %z %Z"

# SAM.gov site root, and the opportunity search URL that a contract number is
# appended to as the keyword.
SAM_BASE = "https://sam.gov"
SAMLINK = "https://sam.gov/search/?index=opp&page=1&pageSize=25&sort=-modifiedDate&sfm%5Bstatus%5D%5Bis_active%5D=true&sfm%5Bstatus%5D%5Bis_inactive%5D=true&sfm%5BsimpleSearch%5D%5BkeywordRadio%5D=ALL&sfm%5BsimpleSearch%5D%5BkeywordTags%5D%5B0%5D%5Bvalue%5D="

# Listing page whose paginated article links are scraped first.
base_site = "https://www.war.gov/News/Contracts/"
|
||
|
|
|
||
|
|
|
||
|
|
# Start a Chrome session and load the first listing page.
br = wd.Chrome()
br.get(base_site)
ps = html.fromstring(br.page_source)
# links = ps.xpath('.//*/p[@class="title"]/a/@href')

# Gather article links from the paginated listing. Only page 2 is visited
# here; use a wider range (for example range(10, 25)) to cover other pages.
page_links = []
for page_number in range(2, 3):
    listing_url = "{}?Page={}".format(base_site, page_number)
    br.get(listing_url)
    sleep(2)  # fixed pause after navigation, before page_source is read
    ps = html.fromstring(br.page_source)
    page_links.extend(ps.xpath('.//*/p[@class="title"]/a/@href'))
|
||
|
|
|
||
|
|
|
||
|
|
# One dict per contract number found in the articles; written to
# contract_list.json below and read again by the SAM.gov pass.
data = []
# Filled by the SAM.gov pass further down the script.
base_data = []

# Candidate contract numbers: runs of 12 or more upper-case letters, digits
# or hyphens. Compiled once because it is applied to every paragraph.
contract_number_re = re.compile(r'[A-Z0-9-]{12,}')

for link in page_links:
    # The article date is rebuilt from the last three hyphen-separated pieces
    # of the link with the final character dropped (presumably a trailing
    # "/" -- confirm against real links). It depends only on the link, so it
    # is parsed once here rather than once per paragraph.
    date_txt = " ".join(link.split("-")[-3:])[:-1] + tz_str
    try:
        article_date = datetime.strptime(date_txt, D3FORMAT)
    except ValueError as e:
        # One link without a usable date should not abort the whole run.
        print("Skipping article, no date in link: ", link, e)
        continue

    br.get(link)
    ps = html.fromstring(br.page_source)
    for para in ps.xpath('.//*/p/text()'):
        new_paragraph, created = Paragraph.objects.get_or_create(
            link=link,
            paragraph=para,
            date=article_date,
        )
        print(created, new_paragraph)

        for contract in contract_number_re.findall(para):
            i = {
                'para': new_paragraph.id,
                # Hyphens are removed from the number before it is stored.
                'contract': contract.replace("-", ""),
                'website': link,
            }
            new_contract, created = OriginalContract.objects.get_or_create(
                para=new_paragraph,
                number=i['contract'],
            )
            print(created, new_contract)
            data.append(i)

with open('contract_list.json', 'w') as fp:
    json.dump(data, fp)
|
||
|
|
|
||
|
|
|
||
|
|
# Placeholder stored when a field is not present on a SAM.gov notice page.
MISSING = "naan"

# UTC offsets for the timezone abbreviations that end SAM.gov timestamps.
# Daylight-saving abbreviations are one hour ahead of their standard zone.
TZ_OFFSETS = {
    "EST": "-0500", "EDT": "-0400",
    "CST": "-0600", "CDT": "-0500",
    "MST": "-0700", "MDT": "-0600",
    "PST": "-0800", "PDT": "-0700",
}

# Contract attribute -> xpath of the text node holding its value. Every field
# listed here falls back to MISSING when the node is absent.
CONTRACT_FIELD_XPATHS = (
    ("inactive_date", './/*[@id="solicitation"]/div/div/div[3]/div[3]/div[2]/div/h5/text()'),
    ("inactive_policy", './/*[@id="solicitation"]/div/div/div[3]/div[3]/div[4]/div/h5/text()'),
    ("resp_date", './/*[@id="solicitation"]/div/div/div[3]/div[4]/div[2]/div/h5/text()'),
    ("us_dept", './/*[@id="solicitation"]/div/div/div[4]/div/div[1]/div[1]/div[2]/h5/text()'),
    ("us_dept_subtier", './/*[@id="solicitation"]/div/div/div[4]/div/div[1]/div[2]/div[2]/h5/text()'),
    ("related_notice_id", './/*[@id="solicitation"]/div/div/div[3]/div[1]/div[4]/div/h5/text()'),
    ("opp_type", './/*[@id="solicitation"]/div/div/div[3]/div[2]/div[2]/div/h5/text()'),
    ("orig_set", './/*[@id="class"]/div[2]/div[1]/div[2]/h5/text()'),
    ("prod_svc_code", './/*[@id="class"]/div[2]/div[2]/div[2]/div/h5/text()'),
    ("naics_code", './/*[@id="class"]/div[2]/div[2]/div[4]/div/h5/text()'),
    ("us_office", './/*[@id="solicitation"]/div/div/div[4]/div/div[2]/div/div[2]/h5/text()'),
    ("major_cmd", './/*[@id="solicitation"]/div/div/div[4]/div/div[1]/div[3]/div[2]/h5/text()'),
    ("award_num", './/*[@id="award-details"]/div[2]/div/div[2]/div[2]/div/h5/text()'),
    ("contract_value", './/*[@id="award-details"]/div[2]/div/div[3]/div[4]/div[3]/div/h5/text()'),
    ("description", './/*[@id="desc"]/div[2]/div/div/p/text()'),
)


def _first_text(tree, path, default=MISSING):
    """Return the stripped first result of ``path`` on ``tree``, or ``default`` if there is none."""
    found = tree.xpath(path)
    if not found:
        return default
    return found[0].strip()


def _parse_sam_timestamp(text):
    """Parse a timestamp that ends in a US timezone abbreviation.

    Returns an aware ``datetime``, or ``None`` when the last word of ``text``
    is not a key of ``TZ_OFFSETS``. Raises ``ValueError`` when the rest of the
    text does not match ``DFORMAT``.
    """
    words = text.split(" ")
    offset = TZ_OFFSETS.get(words[-1].strip())
    if offset is None:
        return None
    return datetime.strptime(" ".join(words[:-1]) + " " + offset + " UTC", DFORMAT)


try:
    for d in data:
        # Search SAM.gov for the contract number found in the article.
        website = SAMLINK + d['contract']
        print("\nGetting Website: ", website)
        br.get(website)
        sleep(2)
        ps = html.fromstring(br.page_source)
        links = ps.xpath('.//*/h3/a[@class="usa-link ng-star-inserted"]/@href')

        for link in links:
            print("\n++++++++\n", d['website'], "\n++++++++++\n")
            br.get(SAM_BASE + link)
            print("\nContract link: ", SAM_BASE + link)
            sleep(4)
            ps = html.fromstring(br.page_source)

            og_contract = OriginalContract.objects.get(number=d['contract'], para__id=d['para'])

            # Only plain strings go into this dict: it is appended to
            # base_data, which is dumped as JSON at the end of the run, and a
            # model instance is not JSON serialisable.
            i = {
                'og_contract': d['contract'],
                'pub_date_txt': _first_text(ps, './/*[@id="solicitation"]/div/div/div[3]/div[4]/div[4]/div/h5/text()', "missed"),
                'title': _first_text(ps, './/*/h1[@class="card-title"]/text()', "missed"),
                'notice_id': _first_text(ps, './/*[@id="solicitation"]/div/div/div[3]/div[1]/div[2]/div/h5/text()', "missed"),
            }

            contract, created = Contract.objects.get_or_create(
                title=i["title"],
                original_contract_number=og_contract,
                notice_id=i["notice_id"],
                pub_date_txt=i["pub_date_txt"],
                contract_url=br.current_url,
            )
            print('CONTRACT SUCCESS', created, contract)

            # A field missing from the page gets the placeholder instead of
            # raising IndexError and ending the whole scrape.
            for field_name, field_path in CONTRACT_FIELD_XPATHS:
                setattr(contract, field_name, _first_text(ps, field_path))

            # line_num is set only when present; its absence no longer skips
            # the award, company and date handling for this notice.
            line_num = _first_text(ps, './/*[@id="solicitation"]/div/div/div[3]/div[2]/div[4]/div/h5/text()', None)
            if line_num is not None:
                contract.line_num = line_num

            i['awarded_name'] = _first_text(ps, './/*[@id="award-details"]/div[2]/div/div[4]/div[2]/div/h5/text()')
            i['award_date'] = _first_text(ps, './/*[@id="award-details"]/div[2]/div/div[1]/div[2]/div/h5/text()')
            i['unq_entity_id'] = _first_text(ps, './/*[@id="award-details"]/div[2]/div/div[3]/div[3]/div/h5/text()')
            i['awarded_addr'] = _first_text(ps, './/*[@id="award-details"]/div[2]/div/div[2]/div[3]/div[2]/h5/text()')
            contract.unq_entity_id = i['unq_entity_id']
            contract.save()

            base_data.append(i)
            print("\nappended info: ", i['title'])

            # A company is recorded only when the page shows a unique entity
            # id, so notices without award details do not create one.
            if i['unq_entity_id'] != MISSING:
                try:
                    new_company, created = Company.objects.get_or_create(
                        name=i["awarded_name"],
                        address_complete=i["awarded_addr"],
                        unq_entity_id=i["unq_entity_id"],
                    )
                    print("COMPANY CREATED: ", created, new_company)
                    contract.company = new_company
                    contract.save()
                except Exception as e:
                    # Best effort: report the failure and keep scraping.
                    print("\nCompany error: ", e)

            try:
                pub_date = _parse_sam_timestamp(contract.pub_date_txt)
                if pub_date is not None:
                    contract.pub_date = pub_date
                    contract.save()
            except Exception as e:
                print("Date Error: ", e)

            if i["award_date"] != MISSING:
                try:
                    # NOTE(review): the award date text carries no timezone;
                    # -0500 is kept from the earlier version -- confirm.
                    contract.award_date = datetime.strptime(i["award_date"] + " -0500 UTC", D2FORMAT)
                    contract.save()
                except Exception as e:
                    print("Award Date Error: ", e)
finally:
    # Always close the browser session, even if the scrape fails part-way.
    br.quit()

with open('final_results.json', 'w') as fp:
    json.dump(base_data, fp)
|