Jump to content

Leaderboard

Popular Content

Showing content with the highest reputation on 02/20/14 in all areas

  1. Ca sa functioneze, ii trebuie un fisier cu domenii, cu sau fara prefixul http:// la inceput. Usage: python emailscrapper.py Threads File Exemplu: python emalscrapper.py 50 domenii.txt Mailurile se salveaza in emails.txt UPDATE 1.3: - izolare completa la proceselor - renuntat la threading, acum functioneaza cu multiprocese total paralele - preluare rezultate mai corect - 2-3x mai rapid fata de precedenta versiune Versiune minima pentru functionare este Python 2.6 Pentru variante customizate, aveti jabberul meu in cod. """ RST eMail Crawler Version: 1.3 Author: Elohim Contact Jabber: viktor@rows.io """ import urllib2 import re import sys import cookielib from threading import Timer from multiprocessing import Process, Queue class GetResults(Process): def __init__(self, rezqueue): Process.__init__(self) self.rezqueue = rezqueue def run(self): while True: email = self.rezqueue.get() if email is None: return False with open("emails.txt","a") as EmailFile: EmailFile.write(email.rstrip()+"\n") print email class Crawler(Process): def __init__(self, queue, rezqueue): Process.__init__(self) self.queue = queue self.rezqueue = rezqueue def run(self): while True: site = self.queue.get() if site is None: return False self.crawl(site) def crawl(self,site): try: WatchIt = Timer(15.0, self.WatchDog) WatchIt.start() cj = cookielib.CookieJar() opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj)) opener.addheaders = [('Accept:','*'),("User-Agent", "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:31.0) Gecko/20100101 Firefox/31.0")] opener.addheaders = [('Content-Type', 'text/html; charset=utf-8'),("Accept-Encoding", "")] resp = opener.open(site,timeout=10) WatchIt.cancel() self.getem(resp.read()) except Exception, e: #print e f = 1 def getem(self,resp): try: emails = re.findall(r"[A-Za-z0-9%&*+?^_`{|}~-]+(?:\.[A-Za-z0-9!#$%&*+?^_`{|}~-]+)*@(?:[A-Za-z0-9](?:[a-z0-9-]*[A-Za-z0-9])?\.)+(?:[A-Za-z]{2}|com|org|net|edu|gov|mil|biz|info|mobi|name|aer o|asia|jobs|museum)\b", 
str(resp)) CleanEmails = set(emails) for em in CleanEmails: self.rezqueue.put(em.lower()) except Exception, e: return False def WatchDog(self): return False if __name__ == "__main__": if len(sys.argv) < 3: print "Usage:",sys.argv[0],"Threads DomainFile.txt" print "\tExample: ",sys.argv[0],"30 domains.txt" sys.exit() queue = Queue(maxsize=30000) rezqueue = Queue() ThreadNumber = int(sys.argv[1]) ThreadList = [] for i in range(ThreadNumber): t = Crawler(queue,rezqueue) t.daemon = True t.start() ThreadList.append(t) GR = GetResults(rezqueue) GR.daemon = True GR.start() with open(sys.argv[2],"rU") as urls: for url in urls: try: if url.startswith('http://'): queue.put(url.rstrip()) else: url = 'http://'+url.rstrip() queue.put(url.rstrip()) except Exception, e: print e for i in range(ThreadNumber): queue.put(None) for Worker in ThreadList: Worker.join() print "All done!" rezqueue.put(None) GR.join()
    1 point
  2. Pune, te rog, la categoria non-IT. Ti-am mutat ambele threaduri (si cel cu pr0n keywords, si asta). De asemenea, nu mai da paste pe RST la 3000 de porn keywords; pune-le mai bine pe Pastebin. Atrag vizite aiurea si dau rau in Google, pentru ca se face asocierea intre domeniu/site si cuvintele acelea. Merci
    1 point
This leaderboard is set to Bucharest/GMT+02:00
×
×
  • Create New...