|
Monday, 14 March 2011 13:17 |
# Scrape government contract details from transparency.ca.gov
import yql
import csv
# Walk every page of the transparency.ca.gov contract listing through YQL's
# `html` table and write one CSV row per contract to ca_contracts.csv.
y = yql.Public()

# The number of <option> elements in the page selector of the first page is
# used as the number of result pages to scrape.
query = (
    'select * from html where url = '
    '"http://www.transparency.ca.gov/Contracts/default.aspx?Page=0" '
    'and xpath = "//select[@id=\'ctl00_ContentPlaceHolder1_wcPage\']/option"'
)
result = y.execute(query)
page_count = result.count

# Binary mode is what the Python 2 csv module expects; on Python 3 this would
# need open(..., 'w', newline='') instead.  The with-block guarantees the file
# is flushed and closed even if a page fails to parse.
with open('ca_contracts.csv', 'wb') as csv_file:
    # Comma-delimited, every field quoted.  (An explicit delimiter=',' on top
    # of the 'excel-tab' dialect yields exactly the 'excel' dialect.)
    contract_writer = csv.writer(csv_file, dialect='excel', quoting=csv.QUOTE_ALL)

    for i in range(page_count):
        print("Parsing page #: %d" % i)
        # Request page i rather than always page 0.
        # NOTE(review): assumes the Page parameter is zero-based, matching the
        # Page=0 used for the first request above -- confirm against the site.
        query = (
            'select * from html where url = '
            '"http://www.transparency.ca.gov/Contracts/default.aspx?Page=%d" '
            'and xpath = "//div[@class=\'module dynamicHide\']"'
        ) % i
        result = y.execute(query)

        # Rows 1..50 hold the contracts (50 contracts per page); row 0 is
        # skipped -- presumably a non-contract block, TODO confirm.  Slicing
        # instead of indexing keeps a short (e.g. last) page from raising
        # IndexError.
        for row in result.rows[1:51]:
            summary = row['table']['tr']['td']
            detail = row['div']

            # PRIMARY DATA FIELDS
            number = summary[0]['p']  # contract number
            dept = summary[1]['p']  # department
            price = summary[2]['p'].replace(u'\xa0', '')  # price, non-breaking spaces removed
            name = detail['p'][0]['content'].strip()  # supplier name
            dates = detail['div'][0]['p']['content'].strip()  # dates
            class_codes = detail['label'][1]['content'].replace("\n", ' ')  # supplier classification codes
            instruct = detail['div'][1]['label']['content'].replace("\n", ' ')  # special instructions
            # Acquisition type and method share one paragraph, one per line.
            acquisition = detail['p'][1]['content'].split("\n")
            ac_type = acquisition[0]  # acquisition type
            ac_method = acquisition[1].strip()  # acquisition method

            # SECONDARY DATA FIELDS
            classification = detail['div'][2]['ul']['li']
            category = classification['h5']['content']  # category
            descr = classification['div'][1]['p']  # classification

            contract_writer.writerow([
                number, dept, price, name, dates, class_codes,
                instruct, ac_type, ac_method, category, descr,
            ])
 Read more: |