
That's how I made my first scraper too, with regex. Now when I look at it, I want to gouge my eyes out :)

Use an HTML parser, like BeautifulSoup4 or LXML. There are others too. Even though some say LXML is "fancier", I use BeautifulSoup4 (and from what I've seen, so do a lot of people); it's friendlier :)
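
For instance, a minimal BeautifulSoup4 sketch (the HTML and tag names here are just placeholders for illustration):

from bs4 import BeautifulSoup

# toy HTML, just to show the idea
html = '<html><body><a href="/foo">Foo</a> <a href="/bar">Bar</a></body></html>'
soup = BeautifulSoup(html)

# no regex needed: query the parsed tree directly
for link in soup.find_all('a'):
    print link.get('href')  # prints /foo, then /bar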

Edit: a few minor remarks about your script:

from urllib.request import urlopen

raises "ImportError: no module named urllib.request", because that module only exists in Python 3.

Why don't you use urllib2 instead?

Also, urlopen().read() returns a string, so you don't need to call str(html) in your regexes.
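
In other words, something like this already works as-is (the URL and the regex are just placeholders):

import re
import urllib2

html = urllib2.urlopen('http://example.com').read()  # html is already a str
print re.findall(r'<title>(.*?)</title>', html)      # no str(html) wrapper needed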

Then, to avoid the Locks (which you say aren't working for you anyway), put the results into a queue instead of a list. Queue is thread-safe: multiple threads can write to a queue without any problems.
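
A minimal sketch of that pattern (the URLs and the fake "work" are placeholders):

import threading
import Queue

results = Queue.Queue()

def worker(url):
    # Queue.put() is thread-safe, so no Lock is needed here
    results.put('fetched ' + url)  # placeholder for real work

threads = [threading.Thread(target=worker, args=(u,))
           for u in ('http://a.example', 'http://b.example')]
for t in threads:
    t.start()
for t in threads:
    t.join()

while not results.empty():
    print results.get()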


Here is an example of a scraper that uses urllib2, multi-threading, and BeautifulSoup4, and writes the results into an .xls file.

Save the script and create a hudhomestore.txt file with "miami,fl" on the first line, without the quotes :) I hope it still works...

from bs4 import BeautifulSoup
from datetime import datetime
import re
import xlwt
import urllib
import threading
import Queue
import urllib2
import time


class HudHomeStore(object):
"""Scraping hudhomestore.com"""

def __init__(self):
self.queue = Queue.Queue()
self.keywords = []
self.row = 0

#get the list of keywords from input file
with open('hudhomestore.txt', 'r') as inputfile:
lines = (line.rstrip() for line in inputfile)
for key in (line for line in lines if line):
self.keywords.append(key)

        # turn "miami,fl" lines into (city, state) tuples
        self.keywords = [tuple(key.split(',')) for key in self.keywords]

    def encodeUrl(self, state, city, pageId):
        """Build the urlencoded search query for a given state/city/page."""
data = urllib.urlencode({
'pageId': str(pageId), 'sPageSize': '10',
'zipCode': '', 'city': str(city),
'county': '', 'sState': str(state),
'fromPrice': '0', 'toPrice': '0',
'fCaseNumber': '', 'bed': '0', 'bath': '0',
'street': '', 'buyerType': '0',
'specialProgram': '', 'Status': '0',
'OrderbyName': 'SCASENUMBER',
'OrderbyValue': 'ASC', 'sLanguage': 'ENGLISH'
})
return data

def encodeString(self, string):
data = unicode(string)
return data.encode('utf8')

    def getNumberOfPages(self, page_source):
        """Extract the number of result pages from the first page's source.
        @page_source - string, HTML page source
        @return - int, the number of result pages"""

        # NOTE: reconstructed body (the original was lost from the post);
        # the "Page X of Y" pager text is an assumption about the site's markup.
        match = re.search(r'Page \d+ of (\d+)', page_source)
        return int(match.group(1)) if match else 1

    def parseData(self, item):
        """Parse one property row from the results table.
        @item - BeautifulSoup Tag, a <tr> row of the results table
        @return - a dictionary with our data"""

        data = item.find_all('td')

        try:
            address = data[2].contents
            address = ' '.join(address[1].strings)
        except (IndexError, AttributeError):
            address = ' '
        notes = data[7].find('span').get_text().strip()
        start_date = data[8].find('span').get_text().strip()
        bedrooms = data[5].find('label').get_text().strip()
        bathrooms = data[6].find('label').get_text().strip()
        starting_bid = data[3].find('span').get_text().strip()
        result = {'site': 'www.hudhomestore.com',
                  'address': address, 'notes': notes,
                  'asset_type': 'NA', 'start_date': start_date,
                  'auction_time': 'NA', 'time_left': 'NA',
                  'starting_bid': starting_bid, 'current_bid': 'NA',
                  'final_amount': 'NA', 'case_no': 'NA',
                  'bedrooms': bedrooms, 'bathrooms': bathrooms,
                  'sq_feet': 'NA', 'end_date': 'NA',
                  'other_notes': 'NA'
                  }
        return result
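
    # NOTE: the two methods below are reconstructed sketches. run() calls
    # getAllPages() and splitPage(), but their bodies were lost from the
    # original post. The search URL and the markup details (pager text,
    # table id) are assumptions about hudhomestore.com, not verified.
    def getAllPages(self, state, city):
        """Yield the HTML source of every result page for a (city, state) pair."""
        url = 'https://www.hudhomestore.com/Listing/PropertySearchResult.aspx'
        first = urllib2.urlopen(url + '?' + self.encodeUrl(state, city, 1)).read()
        yield first
        for pageId in range(2, self.getNumberOfPages(first) + 1):
            yield urllib2.urlopen(
                url + '?' + self.encodeUrl(state, city, pageId)).read()

    def splitPage(self, page):
        """Split a result page into individual property rows (<tr> elements)."""
        soup = BeautifulSoup(page)
        table = soup.find('table', id='dgPropertyList')  # assumed table id
        if table is None:
            return []
        return table.find_all('tr')[1:]  # skip the header row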

def writeToExcel(self):
"""Output the results into results.xls file"""

headers = ['site', 'address', 'notes', 'asset_type',
'start_date', 'auction_time', 'time_left',
'starting_bid', 'current_bid', 'final_amount',
'case_no', 'bedrooms', 'bathrooms', 'sq_feet',
'end_date', 'other_notes']
header_aliases = {'site': 'Site',
'address': 'Address City, St, County Zip',
'notes': 'Notes',
'asset_type': 'Asset Type',
'start_date': 'Start Date',
'auction_time': 'Auction Time',
'time_left': 'Time Left',
'starting_bid': 'Starting Bid',
'current_bid': 'Current Bid',
'final_amount': 'Final Judgement Amount',
'case_no': 'Case No',
'bedrooms': 'Bedrooms',
'bathrooms': 'Bathrooms',
'sq_feet': 'Sq Feet',
'end_date': 'End Date',
'other_notes': 'Other Notes'
}

workbook = xlwt.Workbook(encoding='utf8')
sheet = workbook.add_sheet("HudHomeStore")

for col, header in enumerate(headers):
sheet.write(self.row, col, header_aliases[header])
            # widen the columns that hold long text
            if header in ('address', 'other_notes'):
                sheet.col(col).width = 256 * (len(header_aliases[header]) + 30)
            elif len(header_aliases[header]) < 10:
                sheet.col(col).width = 256 * (len(header_aliases[header]) + 20)
            else:
                sheet.col(col).width = 256 * (len(header_aliases[header]) + 1)
self.row += 1
#get the elements from the queue and save them into excel file
while not self.queue.empty():
item = self.queue.get()
for col, header in enumerate(headers):
content = item[header].encode('utf8')
sheet.write(self.row, col, content)
self.row += 1
workbook.save("hudhomestore.xls")

def worker(self, item):
"""Threads worker.
Put on the queue the dictionaries with results for items on a page"""

self.queue.put(self.parseData(item))

def run(self):
for city, state in self.keywords:
for page in self.getAllPages(state, city):
threads = []
for item in self.splitPage(page):
t = threading.Thread(target=self.worker, args=(item,))
threads.append(t)
                for t in threads:
                    t.start()
                for t in threads:
                    t.join()
self.writeToExcel()


def main():
runner = HudHomeStore()
runner.run()

if __name__ == '__main__':
print "Start"
start = datetime.now()
main()
print "Results were saved in hudhomestore.xls in", datetime.now() - start
time.sleep(2)
