
Posted (edited)

That's how I wrote my first scraper too, with regex. Now when I look at it, it makes me want to gouge my eyes out :)

Use an HTML parser, like BeautifulSoup4 or LXML. There are others too. Although some say LXML is "slicker", I (and from what I've seen, lots of people) use BeautifulSoup4, it's friendlier :)
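For example, a minimal sketch (the URL and the tags are placeholders, not from your script):

import urllib2
from bs4 import BeautifulSoup

html = urllib2.urlopen('http://example.com').read()
soup = BeautifulSoup(html)

# instead of fighting a regex, ask the parser for what you want:
for link in soup.find_all('a'):
    print link.get('href')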

Edit: a few minor observations about your script:

from urllib.request import urlopen

throws ImportError: no module named urllib.request

Why don't you use urllib2?

Also, urlopen().read() returns a string, so you don't need to call str(html) in your regexes.
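In Python 2 that would look something like this (the URL and the pattern are just placeholders):

import re
import urllib2

html = urllib2.urlopen('http://example.com').read()
# html is already a str, so the regex can run on it directly
titles = re.findall(r'<title>(.*?)</title>', html)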

Then, to avoid the Locks (you say they aren't working for you anyway), put the results into a queue instead of a list. Queue is thread-safe: multiple threads can write to one queue without any problems.
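Roughly like this toy sketch (the worker body is made up):

import threading
import Queue

results = Queue.Queue()

def worker(url):
    # ...scrape url here...
    results.put(url)  # Queue.put() is thread-safe, no Lock needed

threads = [threading.Thread(target=worker, args=(u,))
           for u in ['http://a', 'http://b', 'http://c']]
[t.start() for t in threads]
[t.join() for t in threads]

while not results.empty():
    print results.get()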

Edited by fallen_angel
Posted

Here's an example of a scraper that uses urllib2, multi-threading, BeautifulSoup4 and writes the results into an .xls file.

Save the script and make a hudhomestore.txt file with "miami,fl" on the first line, without the quotes :) Hope it still works...

from bs4 import BeautifulSoup
from datetime import datetime
import re
import xlwt
import urllib
import threading
import Queue
import urllib2
import time


class HudHomeStore():
    """Scraping hudhomestore.com"""

    def __init__(self):
        self.queue = Queue.Queue()
        self.keywords = []
        self.row = 0

        # get the list of keywords from input file
        with open('hudhomestore.txt', 'r') as inputfile:
            lines = (line.rstrip() for line in inputfile)
            for key in (line for line in lines if line):
                self.keywords.append(key)

        self.keywords = [(city, state) for (city, state) in [
            key.split(',') for key in self.keywords]]

    def encodeUrl(self, state, city, pageId):
        data = urllib.urlencode({
            'pageId': str(pageId), 'sPageSize': '10',
            'zipCode': '', 'city': str(city),
            'county': '', 'sState': str(state),
            'fromPrice': '0', 'toPrice': '0',
            'fCaseNumber': '', 'bed': '0', 'bath': '0',
            'street': '', 'buyerType': '0',
            'specialProgram': '', 'Status': '0',
            'OrderbyName': 'SCASENUMBER',
            'OrderbyValue': 'ASC', 'sLanguage': 'ENGLISH'
        })
        return data

    def encodeString(self, string):
        data = unicode(string)
        return data.encode('utf8')

    def getNumberOfPages(self, page_source):
        """Extract the number of result pages from the first page's source
        NOTE: reconstructed sketch, the pager element's id is an assumption"""
        soup = BeautifulSoup(page_source)
        pager = soup.find('span', attrs={'id': 'lbTotalPagesNumber'})
        try:
            return int(pager.get_text().strip())
        except (AttributeError, ValueError):
            return 1

    def getAllPages(self, state, city):
        """Yield the HTML source of every result page for a city/state
        NOTE: reconstructed sketch, the search URL is an assumption"""
        url = 'http://www.hudhomestore.com/Listing/PropertySearchResult.aspx'
        first = urllib2.urlopen(url + '?' + self.encodeUrl(state, city, 1)).read()
        yield first
        for pageId in range(2, self.getNumberOfPages(first) + 1):
            yield urllib2.urlopen(url + '?' + self.encodeUrl(state, city, pageId)).read()

    def splitPage(self, page_source):
        """Split a result page into its listing rows (one <tr> per listing)
        NOTE: reconstructed sketch, keeps only rows with enough cells"""
        soup = BeautifulSoup(page_source)
        return [row for row in soup.find_all('tr')
                if len(row.find_all('td')) > 8]

    def parseData(self, item):
        """Parse one listing row
        @item - BeautifulSoup <tr> element with the listing's cells
        @return - a dictionary with our data"""

        data = item.find_all('td')

        try:
            address = data[2].contents
            address = ' '.join(address[1].strings)
        except (IndexError, AttributeError):
            address = ' '
        notes = data[7].find('span').get_text().strip()
        start_date = data[8].find('span').get_text().strip()
        bedrooms = data[5].find('label').get_text().strip()
        bathrooms = data[6].find('label').get_text().strip()
        starting_bid = data[3].find('span').get_text().strip()
        result = {'site': 'www.hudhomestore.com',
                  'address': address, 'notes': notes,
                  'asset_type': 'NA', 'start_date': start_date,
                  'auction_time': 'NA', 'time_left': 'NA',
                  'starting_bid': starting_bid, 'current_bid': 'NA',
                  'final_amount': 'NA', 'case_no': 'NA',
                  'bedrooms': bedrooms, 'bathrooms': bathrooms,
                  'sq_feet': 'NA', 'end_date': 'NA',
                  'other_notes': 'NA'}
        return result

    def writeToExcel(self):
        """Output the results into hudhomestore.xls file"""

        headers = ['site', 'address', 'notes', 'asset_type',
                   'start_date', 'auction_time', 'time_left',
                   'starting_bid', 'current_bid', 'final_amount',
                   'case_no', 'bedrooms', 'bathrooms', 'sq_feet',
                   'end_date', 'other_notes']
        header_aliases = {'site': 'Site',
                          'address': 'Address City, St, County Zip',
                          'notes': 'Notes',
                          'asset_type': 'Asset Type',
                          'start_date': 'Start Date',
                          'auction_time': 'Auction Time',
                          'time_left': 'Time Left',
                          'starting_bid': 'Starting Bid',
                          'current_bid': 'Current Bid',
                          'final_amount': 'Final Judgement Amount',
                          'case_no': 'Case No',
                          'bedrooms': 'Bedrooms',
                          'bathrooms': 'Bathrooms',
                          'sq_feet': 'Sq Feet',
                          'end_date': 'End Date',
                          'other_notes': 'Other Notes'}

        workbook = xlwt.Workbook(encoding='utf8')
        sheet = workbook.add_sheet("HudHomeStore")

        # write the header row and size the columns
        for col, header in enumerate(headers):
            sheet.write(self.row, col, header_aliases[header])
            if header == 'address' or header == 'other_notes':
                sheet.col(col).width = 256 * (len(header_aliases[header]) + 30)
            else:
                if len(header_aliases[header]) < 10:
                    sheet.col(col).width = 256 * (len(header_aliases[header]) + 20)
                else:
                    sheet.col(col).width = 256 * (len(header_aliases[header]) + 1)
        self.row += 1
        # get the elements from the queue and save them into the excel file
        while not self.queue.empty():
            item = self.queue.get()
            for col, header in enumerate(headers):
                content = item[header].encode('utf8')
                sheet.write(self.row, col, content)
            self.row += 1
        workbook.save("hudhomestore.xls")

    def worker(self, item):
        """Threads worker.
        Put on the queue the dictionaries with results for items on a page"""

        self.queue.put(self.parseData(item))

    def run(self):
        for city, state in self.keywords:
            for page in self.getAllPages(state, city):
                threads = []
                for item in self.splitPage(page):
                    t = threading.Thread(target=self.worker, args=(item,))
                    threads.append(t)
                [x.start() for x in threads]
                [x.join() for x in threads]
        self.writeToExcel()


def main():
    runner = HudHomeStore()
    runner.run()

if __name__ == '__main__':
    print "Start"
    start = datetime.now()
    main()
    print "Results were saved in hudhomestore.xls in", datetime.now() - start
    time.sleep(2)
