# Elohim Posted January 6, 2015 Report Posted January 6, 2015
"""Poll the pastebin.com public archive and collect e-mail addresses.

Python 2 script (uses ``urllib2``, ``cookielib`` and ``Queue``).

The main loop fetches the archive page up to 1000 times, sleeping 90
seconds between polls.  Every paste link not seen before is turned into
its raw URL and put on a queue.  Daemon worker threads take URLs from the
queue, download each paste and append every e-mail address not seen
before to ``crawled.txt``.

According to the author's note (kept at the bottom of this block), the
default of two worker threads is what keeps pastebin.com from blocking
the client; raising ``ThreadNumber`` may get it blocked.
"""
import re
import urllib2
import cookielib
import time
import Queue
import threading
import sys
import random

URL = "http://pastebin.com/archive"
USER_AGENT = ("Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:31.0) "
              "Gecko/20100101 Firefox/31.0")

# Matches e-mail addresses; compiled once because getem() runs per paste.
EMAIL_RE = re.compile(r"[A-Za-z0-9%&*+?^_`{|}~-]+(?:\.[A-Za-z0-9!#$%&*+?^_`{|}~-]+)*@(?:[A-Za-z0-9](?:[a-z0-9-]*[A-Za-z0-9])?\.)+(?:[A-Za-z]{2}|com|org|net|edu|gov|mil|biz|info|mobi|name|aero|asia|jobs|museum)\b")

# Matches the href of each paste row on the archive page.
LINK_RE = re.compile('<td><img src="/i/t.gif" class="i_p0" alt="" border="0" /><a href="(.*?)">')

links = set()   # archive hrefs that have already been queued
linkfile = open("pblinks.txt", "a")   # opened, but nothing in this script writes to it
emf = open('crawled.txt', 'a')        # output file: one lower-cased address per line
emlst = set()   # lower-cased addresses already written to emf


def _new_opener(cj):
    """Return a cookie-aware urllib2 opener with the crawler's headers.

    All headers are set in a single list: assigning ``addheaders`` a
    second time replaces the first list instead of extending it.
    """
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    opener.addheaders = [
        ('Accept', '*/*'),
        ("User-Agent", USER_AGENT),
        ('Content-Type', 'text/html; charset=utf-8'),
        ("Accept-Encoding", ""),
    ]
    return opener


def getem(resp):
    """Record every new e-mail address found in *resp*.

    *resp* is converted with ``str()`` and scanned with ``EMAIL_RE``.
    Each address is lower-cased; addresses not yet in ``emlst`` are added
    to it, written to ``emf`` (flushed immediately) and printed.

    Returns None on success, or False after printing the error if
    anything raised.
    """
    try:
        for em in EMAIL_RE.findall(str(resp)):
            em = em.lower()
            if em not in emlst:
                emlst.add(em)
                emf.write(em + '\n')
                emf.flush()
                print(em)
    except Exception as e:
        print(e)
        return False


def WatchDog():
    """Timer callback: report that a download did not finish in time."""
    print("Fucked up, quiting")
    return False


class Crawler(threading.Thread):
    """Worker thread that downloads queued paste URLs and scans them."""

    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        """Process URLs from the queue forever (the thread is a daemon)."""
        while True:
            site = self.queue.get()
            self.crawl(site)
            self.queue.task_done()

    def crawl(self, site):
        """Download *site* and pass its body to getem().

        Sleeps a random 1-14 seconds first.  An 18-second timer calling
        WatchDog is started before the request and cancelled once the
        request has returned; if the request raises, the timer is not
        cancelled and WatchDog still fires.

        Returns None on success, or False after printing the error.
        """
        try:
            time.sleep(random.randrange(1, 15))
            WatchIt = threading.Timer(18.0, WatchDog)
            WatchIt.start()
            print("Working on %s" % site)
            opener = _new_opener(cookielib.CookieJar())
            resp = opener.open(site, timeout=15)
            WatchIt.cancel()
            getem(resp.read())
        except Exception as e:
            print(e)
            return False


queue = Queue.Queue(maxsize=40000)
ThreadNumber = 2
for i in range(ThreadNumber):
    t = Crawler(queue)
    t.setDaemon(True)
    t.start()


def getToken(contentHtml):
    """Return the list of paste hrefs found in the archive page HTML."""
    return LINK_RE.findall(contentHtml)


cj = cookielib.CookieJar()
opener = _new_opener(cj)

for i in range(1000):
    try:
        print("Getting links")
        resp = opener.open(URL, timeout=15)
        linklist = getToken(resp.read())
        print("Got %d Links" % len(linklist))
        for link in linklist:
            if link not in links:
                # link[1:] drops the first character of the href before
                # it is appended to the raw-paste URL.
                queue.put("http://pastebin.com/raw.php?i=" + link[1:])
                links.add(link)
    except Exception as e:
        print(e)
    # Sleep after failed polls too, so an error does not turn the loop
    # into back-to-back requests against the archive page.
    print("Sleeping")
    time.sleep(90)
    print("Phase %d" % i)

queue.join()

# Original post text (Romanian), preserved from the forum post:
# Urmareste toate paste-urile publice, si cauta mail-uri in ele. Ce gaseste, salveaza in crawled.txt.
# Se poate adapta de cine doreste sa caute email : pass , config-uri, etc.
# Asa cum e setat, nu se supara pastebin.com si nu blocheaza nimic. Daca mariti threadurile, nu garantez ca nu va blocheaza.
# Quote
Hubba Posted March 5, 2015 Report Posted March 5, 2015 de ce imi da asa ? File "rst.py", line 70 opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(c j)) ^ Quote
Ganav Posted March 5, 2015 Report Posted March 5, 2015 de ce imi da asa ? Post-eaza mesajul de eroare complet. Este posibil sa nu ai liburl. Quote
Hubba Posted March 5, 2015 Report Posted March 5, 2015 Post-eaza mesajul de eroare complet. Este posibil sa nu ai liburl. [root@localhost test]# python crawler.py File "crawler.py", line 52 opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(c j)) ^ SyntaxError: invalid syntax asta e tot Quote
Ganav Posted March 5, 2015 Report Posted March 5, 2015 [root@localhost test]# python crawler.py File "crawler.py", line 52 opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(c j)) ^ SyntaxError: invalid syntax asta e tot Verifica indentarile. Poti folosi pycharm pentru a face acest lucru automat. Quote
pr00f Posted March 5, 2015 Report Posted March 5, 2015 (edited) Dupa cate vad, scriptul este scris in python 2.x, (print ""), incearca sa il rulezi cu binary-ul care trebuie, e posibil sa ai default python 3.x. Spre exemplu, in Arch: $ python Python 3.4.3 (default, Feb 26 2015, 23:01:07) [GCC 4.9.2 20150204 (prerelease)] on linux Type "help", "copyright", "credits" or "license" for more information. >>> Edit: s-a uitat o virgula in cod, sau ceva Edited March 5, 2015 by pr00f Quote
Twiff Posted March 5, 2015 Report Posted March 5, 2015 Sterge spatiile dintre c si j si o sa iti ruleze. Quote
Hubba Posted March 5, 2015 Report Posted March 5, 2015 (edited) Dupa cate vad, scriptul este scris in python 2.x, (print ""), incearca sa il rulezi cu binary-ul care trebuie, e posibil sa ai default python 3.x. Spre exemplu, in Arch: $ python Python 3.4.3 (default, Feb 26 2015, 23:01:07) [GCC 4.9.2 20150204 (prerelease)] on linux Type "help", "copyright", "credits" or "license" for more information. >>> Edit: s-a uitat o virgula in cod, sau ceva am editat si acum imi da asa .. Working on http://pastebin.com/raw.php?i=ZDAsw9xD Working on http://pastebin.com/raw.php?i=icGFDC2a sper ca e bine Edited March 5, 2015 by Hubba Quote