MasterLight Posted June 27, 2014 (edited) . Edited July 5, 2016 by MasterLight
fallen_angel Posted June 27, 2014 (edited)

That's how I wrote my first scraper too, with regex. Now when I look at it I want to gouge my eyes out. Use an HTML parser, like BeautifulSoup4 or LXML. There are others as well. Although some say LXML is "slicker", I use BeautifulSoup4 (and from what I've seen, so do most people), it's friendlier.

Edit: a few minor observations about your script:

from urllib.request import urlopen

raises ImportError: No module named urllib.request. Why don't you use urllib2?

Also, urlopen().read() already returns a string, so you don't need to call str(html) in your regexes.

Then, to avoid the Locks (you say they don't work for you anyway), put the results into a queue instead of a list. Queue is thread-safe: several threads can write into one queue without any problems.

Edited June 27, 2014 by fallen_angel
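A minimal sketch of that queue idea (assuming Python 2, where the module is named Queue; it was renamed queue in Python 3). The URLs and the per-thread work here are made up for illustration:

import threading
import Queue
import urllib2

results = Queue.Queue()   # thread-safe, no Lock needed around put()

def fetch(url):
    # urlopen().read() already returns a string, no str() call needed
    html = urllib2.urlopen(url).read()
    results.put((url, len(html)))

urls = ['http://example.com', 'http://example.org']
threads = [threading.Thread(target=fetch, args=(u,)) for u in urls]
for t in threads:
    t.start()
for t in threads:
    t.join()

# drain the queue once all the writer threads have finished
while not results.empty():
    print results.get()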
fallen_angel Posted June 27, 2014

Here's an example of a scraper that uses urllib2, multi-threading and BeautifulSoup4, and writes the results into an .xls file. Save the script and create a hudhomestore.txt file with "miami,fl" (without the quotes) on the first line. Hope it still works...

from bs4 import BeautifulSoup
from datetime import datetime
import re
import xlwt
import urllib
import threading
import Queue
import urllib2
import time

# assumption: the endpoint the search form POSTs its parameters to;
# adjust this if the site has moved
SEARCH_URL = 'https://www.hudhomestore.com/Listing/PropertySearchResult.aspx'


class HudHomeStore():
    """Scraping hudhomestore.com"""

    def __init__(self):
        self.queue = Queue.Queue()
        self.keywords = []
        self.row = 0
        # get the list of keywords from input file
        with open('hudhomestore.txt', 'r') as inputfile:
            lines = (line.rstrip() for line in inputfile)
            for key in (line for line in lines if line):
                self.keywords.append(key)
        self.keywords = [(city, state) for (city, state) in
                         [key.split(',') for key in self.keywords]]

    def encodeUrl(self, state, city, pageId):
        data = urllib.urlencode({
            'pageId': str(pageId),
            'sPageSize': '10',
            'zipCode': '',
            'city': str(city),
            'county': '',
            'sState': str(state),
            'fromPrice': '0',
            'toPrice': '0',
            'fCaseNumber': '',
            'bed': '0',
            'bath': '0',
            'street': '',
            'buyerType': '0',
            'specialProgram': '',
            'Status': '0',
            'OrderbyName': 'SCASENUMBER',
            'OrderbyValue': 'ASC',
            'sLanguage': 'ENGLISH'
        })
        return data

    def encodeString(self, string):
        data = unicode(string)
        return data.encode('utf8')

    def getNumberOfPages(self, page_source):
        """Extract the number of result's pages from first page source
        @page_source - string, HTML page source
        @return - int, how many result pages there are"""
        # assumption: the pager prints something like "Page 1 of N"
        # somewhere in the page source
        match = re.search(r'Page\s+\d+\s+of\s+(\d+)', page_source)
        return int(match.group(1)) if match else 1

    def getAllPages(self, state, city):
        """Yield every result page for a (state, city) pair as a
        BeautifulSoup object"""
        first = urllib2.urlopen(SEARCH_URL,
                                self.encodeUrl(state, city, 1)).read()
        yield BeautifulSoup(first)
        for pageId in xrange(2, self.getNumberOfPages(first) + 1):
            source = urllib2.urlopen(SEARCH_URL,
                                     self.encodeUrl(state, city, pageId)).read()
            yield BeautifulSoup(source)

    def splitPage(self, page):
        """Split a result page into the table rows holding one property each"""
        # assumption: a property row is any <tr> with enough cells for
        # parseData() to index into
        return [row for row in page.find_all('tr')
                if len(row.find_all('td')) > 8]

    def parseData(self, data):
        """Extract the fields of one property
        @data - BeautifulSoup tag, one result row
        @return - a dictionary with our data"""
        data = data.find_all('td')
        try:
            address = data[2].contents
            address = ' '.join(address[1].strings)
        except:
            address = ' '
        notes = data[7].find('span').get_text().strip()
        start_date = data[8].find('span').get_text().strip()
        bedrooms = data[5].find('label').get_text().strip()
        bathrooms = data[6].find('label').get_text().strip()
        starting_bid = data[3].find('span').get_text().strip()
        result = {'site': 'www.hudhomestore.com',
                  'address': address,
                  'notes': notes,
                  'asset_type': 'NA',
                  'start_date': start_date,
                  'auction_time': 'NA',
                  'time_left': 'NA',
                  'starting_bid': starting_bid,
                  'current_bid': 'NA',
                  'final_amount': 'NA',
                  'case_no': 'NA',
                  'bedrooms': bedrooms,
                  'bathrooms': bathrooms,
                  'sq_feet': 'NA',
                  'end_date': 'NA',
                  'other_notes': 'NA'}
        return result

    def writeToExcel(self):
        """Output the results into hudhomestore.xls file"""
        headers = ['site', 'address', 'notes', 'asset_type', 'start_date',
                   'auction_time', 'time_left', 'starting_bid', 'current_bid',
                   'final_amount', 'case_no', 'bedrooms', 'bathrooms',
                   'sq_feet', 'end_date', 'other_notes']
        header_aliases = {'site': 'Site',
                          'address': 'Address City, St, County Zip',
                          'notes': 'Notes',
                          'asset_type': 'Asset Type',
                          'start_date': 'Start Date',
                          'auction_time': 'Auction Time',
                          'time_left': 'Time Left',
                          'starting_bid': 'Starting Bid',
                          'current_bid': 'Current Bid',
                          'final_amount': 'Final Judgement Amount',
                          'case_no': 'Case No',
                          'bedrooms': 'Bedrooms',
                          'bathrooms': 'Bathrooms',
                          'sq_feet': 'Sq Feet',
                          'end_date': 'End Date',
                          'other_notes': 'Other Notes'}
        workbook = xlwt.Workbook(encoding='utf8')
        sheet = workbook.add_sheet("HudHomeStore")
        for col, header in enumerate(headers):
            sheet.write(self.row, col, header_aliases[header])
            if header == 'address' or header == 'other_notes':
                sheet.col(col).width = 256 * (len(header_aliases[header]) + 30)
            else:
                if len(header_aliases[header]) < 10:
                    sheet.col(col).width = 256 * (len(header_aliases[header]) + 20)
                else:
                    sheet.col(col).width = 256 * (len(header_aliases[header]) + 1)
        self.row += 1
        # get the elements from the queue and save them into excel file
        while not self.queue.empty():
            item = self.queue.get()
            for col, header in enumerate(headers):
                content = item[header].encode('utf8')
                sheet.write(self.row, col, content)
            self.row += 1
        workbook.save("hudhomestore.xls")

    def worker(self, item):
        """Threads worker. Put on the queue the dictionaries with
        results for items on a page"""
        self.queue.put(self.parseData(item))

    def run(self):
        for city, state in self.keywords:
            for page in self.getAllPages(state, city):
                threads = []
                for item in self.splitPage(page):
                    t = threading.Thread(target=self.worker, args=(item,))
                    threads.append(t)
                [x.start() for x in threads]
                [x.join() for x in threads]
        self.writeToExcel()


def main():
    runner = HudHomeStore()
    runner.run()


if __name__ == '__main__':
    print "Start"
    start = datetime.now()
    main()
    print "Results were saved in hudhomestore.xls in", datetime.now() - start
    time.sleep(2)
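One note on the input file, going by the constructor above: it reads every non-empty line of hudhomestore.txt and splits it on the comma, so the file can hold several searches at once, one city,state pair per line, for example:

miami,fl
orlando,fl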
MasterLight (author) Posted June 27, 2014 (edited) . Edited July 5, 2016 by MasterLight