To work, it needs a file of domains, with or without the http:// prefix at the start.
Usage: python emailscrapper.py Threads File
Example: python emailscrapper.py 50 domenii.txt
The harvested emails are saved to emails.txt

UPDATE 1.3:
- full process isolation
- dropped threading; the crawler now runs fully parallel processes
- more reliable collection of results
- 2-3x faster than the previous version

The minimum Python version required is 2.6.
For customized variants, my Jabber address is in the code.

"""
RST eMail Crawler
Version: 1.3
Author: Elohim
Contact Jabber: viktor@rows.io
"""

import urllib2
import re
import sys
import cookielib
from threading import Timer
from multiprocessing import Process, Queue


class GetResults(Process):
    """Consumer process: pulls emails off the result queue and appends them to emails.txt."""

    def __init__(self, rezqueue):
        Process.__init__(self)
        self.rezqueue = rezqueue

    def run(self):
        while True:
            email = self.rezqueue.get()
            if email is None:  # sentinel: no more results
                return False
            with open("emails.txt", "a") as EmailFile:
                EmailFile.write(email.rstrip() + "\n")
            print email


class Crawler(Process):
    """Worker process: pulls URLs off the work queue, fetches each page and extracts emails."""

    def __init__(self, queue, rezqueue):
        Process.__init__(self)
        self.queue = queue
        self.rezqueue = rezqueue

    def run(self):
        while True:
            site = self.queue.get()
            if site is None:  # sentinel: no more work
                return False
            self.crawl(site)

    def crawl(self, site):
        try:
            WatchIt = Timer(15.0, self.WatchDog)
            WatchIt.start()
            cj = cookielib.CookieJar()
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
            # Single header list: the original assigned addheaders twice, so the
            # first set (User-Agent) was silently overwritten.
            opener.addheaders = [
                ("User-Agent", "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:31.0) Gecko/20100101 Firefox/31.0"),
                ("Accept", "*/*"),
                ("Content-Type", "text/html; charset=utf-8"),
                ("Accept-Encoding", ""),
            ]
            resp = opener.open(site, timeout=10)
            WatchIt.cancel()
            self.getem(resp.read())
        except Exception:
            pass  # ignore unreachable or broken sites

    def getem(self, resp):
        try:
            emails = re.findall(r"[A-Za-z0-9%&*+?^_`{|}~-]+(?:\.[A-Za-z0-9!#$%&*+?^_`{|}~-]+)*@(?:[A-Za-z0-9](?:[a-z0-9-]*[A-Za-z0-9])?\.)+(?:[A-Za-z]{2}|com|org|net|edu|gov|mil|biz|info|mobi|name|aero|asia|jobs|museum)\b", str(resp))
            CleanEmails = set(emails)  # deduplicate within the page
            for em in CleanEmails:
                self.rezqueue.put(em.lower())
        except Exception:
            return False

    def WatchDog(self):
        return False


if __name__ == "__main__":
    if len(sys.argv) < 3:
        print "Usage:", sys.argv[0], "Threads DomainFile.txt"
        print "\tExample: ", sys.argv[0], "30 domains.txt"
        sys.exit()

    queue = Queue(maxsize=30000)
    rezqueue = Queue()
    ThreadNumber = int(sys.argv[1])
    ThreadList = []

    # Spawn the crawler workers.
    for i in range(ThreadNumber):
        t = Crawler(queue, rezqueue)
        t.daemon = True
        t.start()
        ThreadList.append(t)

    # Single writer process, so emails.txt is only written from one place.
    GR = GetResults(rezqueue)
    GR.daemon = True
    GR.start()

    # Feed the work queue; add the http:// prefix where it is missing.
    with open(sys.argv[2], "rU") as urls:
        for url in urls:
            try:
                if url.startswith('http://'):
                    queue.put(url.rstrip())
                else:
                    url = 'http://' + url.rstrip()
                    queue.put(url.rstrip())
            except Exception, e:
                print e

    # One sentinel per worker, then wait for everything to drain.
    for i in range(ThreadNumber):
        queue.put(None)
    for Worker in ThreadList:
        Worker.join()
    print "All done!"
    rezqueue.put(None)
    GR.join()
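
For reference, a quick standalone way to sanity-check the extraction pattern used in getem() before running a full crawl: the snippet below reuses the same regex on a small HTML string. The sample_html content is invented purely for illustration and is not part of the script.

# Minimal sketch: test the email regex from getem() on a hand-written page.
import re

EMAIL_RE = r"[A-Za-z0-9%&*+?^_`{|}~-]+(?:\.[A-Za-z0-9!#$%&*+?^_`{|}~-]+)*@(?:[A-Za-z0-9](?:[a-z0-9-]*[A-Za-z0-9])?\.)+(?:[A-Za-z]{2}|com|org|net|edu|gov|mil|biz|info|mobi|name|aero|asia|jobs|museum)\b"

# Made-up sample input, just to see what the pattern captures.
sample_html = '<a href="mailto:contact@example.com">Contact</a> or sales@example.org'

for em in set(re.findall(EMAIL_RE, sample_html)):
    print em.lower()

Run with the same Python 2 interpreter the crawler targets; it should print the two addresses in lowercase, mirroring what the workers push onto the result queue.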