Pages

Apr 22, 2015

[Python] An example of scraping data from a website and writing it to a CSV file

In order to scrape data from a website, I used the "BeautifulSoup" module for Python. The data I want to get from the website (http://www.charitynavigator.org/index.cfm?bay=topten.detail&listid=24#.VTfpxa3BzGc) is the "10 Super-Sized Charities."



Sample code is shown below:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv
def outputCSV(dataSet, filename):
    """Echo *dataSet* to stdout and write it to *filename* as CSV.

    Args:
        dataSet: Iterable of rows; each row is an iterable of cell values
            and becomes one record in the file.
        filename: Path of the CSV file to write. The file is opened in
            'w' mode, so any existing content is replaced.
    """
    print(dataSet)
    # newline='' hands line-ending control to the csv module, which avoids
    # blank lines between records on platforms that translate '\n'.
    with open(filename, 'w', newline='') as csvfile:
        csv.writer(csvfile).writerows(dataSet)
    # No explicit close() is needed: leaving the with-block closes the file.
def main():
    """Scrape the first table of the Charity Navigator page and save it as CSV.

    Downloads the page, takes the first <table> element, uses the <th>
    cells of its first row as the header record and the <td> cells of each
    following row as a data record. Every record is printed and the whole
    data set is written to 'web_scrape.csv' via outputCSV().

    Raises:
        IndexError: If the page contains no <table>, the table has no
            rows, or a header cell has no contents.
    """
    url = 'http://www.charitynavigator.org/index.cfm?bay=topten.detail&listid=24#.VTfpxa3BzGc'
    # Use the response as a context manager so the connection is released
    # as soon as the body has been read.
    with urlopen(url) as response:
        # Name the parser explicitly so the parse tree does not depend on
        # which optional parsers happen to be installed.
        soup = BeautifulSoup(response.read(), 'html.parser')

    # Query the rows once instead of re-searching the document per row.
    rows = soup('table')[0].find_all('tr')

    # Header record: first child of each <th> in the first row.
    dataSet = [[th.contents[0] for th in rows[0].find_all('th')]]

    # Data records: every row after the header.
    for tr in rows[1:]:
        data = []
        for cell in tr.find_all('td'):
            # When the cell wraps its text in a hyperlink, take the link's
            # contents so the value is the text rather than the <a> tag.
            contents = cell.contents if cell.a is None else cell.a.contents
            if contents:
                data.append(contents[0].strip())
        # Rows that yielded no values (e.g. rows without <td>) are skipped.
        if data:
            dataSet.append(data)

    for record in dataSet:
        print(record)
    outputCSV(dataSet, 'web_scrape.csv')
# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
view raw scrape.py hosted with ❤ by GitHub
And the result is

No comments:

Post a Comment