Back in sophomore year of college, one of Tara's friends, Ellora, asked if I could help her scrape a publicly accessible database because the guy who owned the database wouldn't give her a dump of the data. The database belonged to the Centre for the Study of the Legacies of British Slavery at UCL. This was one of my first scrapers, so my apologies for using bash.
She later explained exactly what she was doing with the data (incredibly cool stuff and I'm very proud to have played a small role in this; she's now an economics professor who studies this stuff):
The research that you helped scrape data for was looking at the impact that compensation of British slaveowners in the mid-19th century had on the British economy. The work is related to the literature trying to understand Western economies' ties to and reliance on slavery for their economic development (a topic covered extensively in the recent NY Times 1619 project). The very rich data you scraped included all claims made after the British government decided to compensate former slaveowners upon emancipation in 1834. The individual claim data give the name of the claimant or former slaveowner, the location of the plantation that was the basis of their claim, the number of slaves, and the amount being claimed, etc. We are estimating the effect of the payments on different sectors in the economy that the former slaveowners were tied to or had a stake in.
getsource.sh
#!/bin/bash
echo "curl get next page:"
echo "now extracting next 50 starting at $1"
#'>' overwrites currentRun on every call; main.py handles appending the rows we keep into cumulativeData
curl --data "start=$1input_surname=&input_firstname=&input_sex=&input_type=&input_keyword=&input_education=&input_occupation=&input_religion=&input_absentee=&input_birth=&input_death=&input_wealth_min=&input_wealth_max=&input_address=&input_street=&input_district=&input_city=&input_county=&input_region=&input_country=&input_claim=&input_colony=&input_estate=&input_parish=&input_collectedby=&input_category=&input_compensation_min=0&input_compensation_max=1000000000&input_enslaved_min=&input_enslaved_max=&submit=$2" http://www.ucl.ac.uk/lbs/search/ > currentRun
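Looking back, the same POST can be made straight from Python; here's a rough requests-based sketch of what getsource.sh does. The field names are copied from the curl call above, most of the blank filter fields are left out for brevity (the server may or may not insist on them), and I haven't re-tested it against the current version of the site.

import requests

def fetch_page(start, submit=''):
    # same endpoint and fields as the curl call above; pass '' for the first page
    # and the decoded form of "Next+50+%E2%86%92" (i.e. "Next 50" plus an arrow)
    # for every page after that
    payload = {
        'start': start,
        'input_compensation_min': '0',
        'input_compensation_max': '1000000000',
        'submit': submit,
    }
    return requests.post('http://www.ucl.ac.uk/lbs/search/', data=payload).text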
main.py
#!/usr/bin/python
import os
import time
import traceback

TESTING = False

time.sleep(1)
print "starting new run: {0}".format(time.asctime(time.localtime(time.time())))

currentCurlRequest = 0
currentTimeElapsed = 0
currentIndex = 0
howHighToVary = 55878 #may have missed the tail end: stopped originally at 55878, but may have needed to go to 55994
amountToIncreaseIndexEachTime = 50
veryFirstRun = True #we need this because the first 50 have to be passed a different submit parameter

while currentIndex < howHighToVary:
    startTime = time.time()
    #fetch the next page of 50 results into the file 'currentRun'
    if veryFirstRun:
        os.system("./getsource.sh {0} {1}".format(currentIndex, ''))
        veryFirstRun = False
    else:
        os.system("./getsource.sh {0} {1}".format(currentIndex, "Next+50+%E2%86%92"))
    currentIndex += amountToIncreaseIndexEachTime
    endTime = time.time()
    currentTimeElapsed += (endTime - startTime)
    currentCurlRequest += 1
    print "-------------------------------------------"
    print "Running average time per curl request: {0}".format(currentTimeElapsed / currentCurlRequest)
    print "-------------------------------------------"

    #append just the table rows from this page to 'cumulativeData'
    writeFile = open('cumulativeData', 'a')
    line = ""
    dataStarted = False
    with open("currentRun") as currentRun:
        while '</tbody></table>' not in line:
            line = currentRun.readline()
            if line == "":
                break #hit EOF without seeing the table close; don't loop forever
            if '</tbody></table>' in line:
                break
            if dataStarted:
                writeFile.write(line)
            if '</th></tr></thead><tbody>' in line:
                #drop a marker between pages
                writeFile.write('Data for {0}'.format(currentIndex))
                dataStarted = True
    writeFile.close()
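Because of the uncertainty about the tail end (55878 vs. 55994), a quick check like the sketch below would have told me how many pages of 50 actually landed in cumulativeData. This wasn't part of the original run; it just counts the markers main.py drops between pages.

#!/usr/bin/python
#count the "Data for N" markers written by main.py, one per page of 50 results
pages = 0
with open('cumulativeData') as f:
    for line in f:
        if 'Data for ' in line:
            pages += 1
print "pages captured: {0} (roughly {1} records)".format(pages, pages * 50)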
parsingResult.py
#!/usr/bin/python
outputCSV = open('outputCSV.csv', 'w')
line = "not empty init"
needToGetNextLine = True
with open("cumulativeData") as cumulativeData:
    while line != "":
        if needToGetNextLine is True:
            line = cumulativeData.readline()
        else:
            needToGetNextLine = True
        if '<tr><td>' in line: #starting a claim
            line = cumulativeData.readline() #presumed to be the name of the claim
            line = line[40:]
            endOffset = line.find('</a></strong> </div>')
            startOffset = line.find("\">") + 2
            claimName = line[startOffset:endOffset]
            print claimName
            #claim name done
            #decided not to use regex because indexing can get the job done
            line = cumulativeData.readline() #getting pound data, S, D, and enslaved count
            line = line[49:]
            endOffset = line.find('</strong> </div>')
            ourFourValues = line[:endOffset]
            claimPounds = line[:line.find(" ")]
            line = line[line.find(" ") + 1:]
            claimS = line[:line.find("S")]
            line = line[line.find("S") + 2:]
            claimD = line[:line.find("D")]
            claimEnslaved = line[line.find("[") + 1:line.find(" Enslaved")]
            #print claimPounds
            #print claimS
            #print claimD
            #print claimEnslaved
            #store the data in CSV
            outputCSV.write(claimName)
            outputCSV.write(',')
            outputCSV.write(claimPounds)
            outputCSV.write(',')
            outputCSV.write(claimS)
            outputCSV.write(',')
            outputCSV.write(claimD)
            outputCSV.write(',')
            outputCSV.write(claimEnslaved)
            outputCSV.write(',')
            line = "first pass should always go"
            currentAssociation = "" #in case a name turns up before its label
            dictionaryOfAssociates = {}
            while line != "": #take in the variable number of associations to the claim
                line = cumulativeData.readline()
                if "label-red" in line:
                    currentAssociation = line[line.find("label-red") + 11:line.find("</span>")]
                elif "twelve columns" in line:
                    line = line[34:]
                    currentAssociationName = line[line.find("\">") + 2:line.find("</a>")]
                    dictionaryOfAssociates[currentAssociationName] = currentAssociation
                    #print currentAssociation + " " + currentAssociationName
                if '<tr><td>' in line:
                    #reached the next claim: flush this claim's associations, then
                    #reprocess this same line as the next claim's opening row
                    #print dictionaryOfAssociates
                    for associate in dictionaryOfAssociates:
                        outputCSV.write(associate)
                        outputCSV.write(':')
                        outputCSV.write(dictionaryOfAssociates[associate])
                        outputCSV.write(',')
                    outputCSV.write('\n')
                    needToGetNextLine = False
                    break
outputCSV.close()
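One thing I'd change today: a claim name that contains a comma breaks the hand-rolled comma writing above. Here's a sketch of the same row written through the csv module instead; write_claim_row is a hypothetical helper, not something in the script above, and the columns are the same (name, pounds, shillings, pence, enslaved count, then one associate:role cell per association).

import csv

def write_claim_row(writer, claim_name, pounds, s, d, enslaved, associates):
    #associates plays the role of dictionaryOfAssociates above: name -> association label
    row = [claim_name, pounds, s, d, enslaved]
    row += ['{0}:{1}'.format(name, label) for name, label in associates.items()]
    writer.writerow(row) #the csv module handles quoting, so embedded commas are safe

#usage: writer = csv.writer(open('outputCSV.csv', 'wb')), then one call per claim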