Jump to content
Elohim

RST eMail Crawler v1.0

Recommended Posts

Ca sa functioneze, ii trebuie un fisier cu domenii, cu sau fara prefixul http:// la inceput.

Usage: python emailscrapper.py Threads File

Exemplu: python emailscrapper.py 50 domenii.txt

Mailurile se salveaza in emails.txt

UPDATE 1.3:

- izolare completa a proceselor

- renuntat la threading, acum functioneaza cu multiprocese total paralele

- preluare rezultate mai corect

- 2-3x mai rapid fata de precedenta versiune

Versiunea minima pentru functionare este Python 2.6

Pentru variante customizate, aveti jabberul meu in cod.

"""

RST eMail Crawler

Version: 1.3

Author: Elohim

Contact Jabber: viktor@rows.io

"""

import urllib2

import re

import sys

import cookielib

from threading import Timer

from multiprocessing import Process, Queue

class GetResults(Process):
    """Writer process that drains the result queue into ``emails.txt``.

    Every item taken from ``rezqueue`` has its trailing whitespace removed,
    is appended as one line to ``emails.txt`` (relative to the current
    working directory) and is echoed to stdout.  A ``None`` item is the
    shutdown sentinel.
    """

    def __init__(self, rezqueue):
        """Store the queue the crawler processes push found addresses to."""
        Process.__init__(self)
        self.rezqueue = rezqueue

    def run(self):
        """Consume addresses until the ``None`` sentinel arrives.

        Returns:
            False once the sentinel has been received.
        """
        # Open the output file once instead of once per address; flushing
        # after every write keeps the on-disk file as current as reopening
        # it did.
        with open("emails.txt", "a") as email_file:
            while True:
                email = self.rezqueue.get()
                if email is None:
                    return False
                email_file.write(email.rstrip() + "\n")
                email_file.flush()
                # Single-argument print() behaves the same on Python 2 and 3.
                print(email)

class Crawler(Process):
    """Worker process that fetches pages and extracts e-mail addresses.

    URLs are taken from ``queue``; every address found in a fetched page is
    lowercased and pushed to ``rezqueue``.  A ``None`` item on ``queue`` is
    the shutdown sentinel.
    """

    # Compiled once per process.  IGNORECASE is required because the pattern
    # spells the TLDs and part of the domain label in lower case only, while
    # the results are lowercased afterwards anyway.  The "aero" alternative
    # was previously corrupted to "aer o" and could never match.
    EMAIL_RE = re.compile(
        r"[A-Za-z0-9%&*+?^_`{|}~-]+(?:\.[A-Za-z0-9!#$%&*+?^_`{|}~-]+)*"
        r"@(?:[A-Za-z0-9](?:[a-z0-9-]*[A-Za-z0-9])?\.)+"
        r"(?:[A-Za-z]{2}|com|org|net|edu|gov|mil|biz|info|mobi|name|aero"
        r"|asia|jobs|museum)\b",
        re.IGNORECASE,
    )

    # All request headers in ONE list: assigning ``addheaders`` twice made
    # the second assignment discard the Accept / User-Agent pair.
    REQUEST_HEADERS = [
        ("Accept", "*/*"),
        ("User-Agent", "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:31.0) Gecko/20100101 Firefox/31.0"),
        ("Content-Type", "text/html; charset=utf-8"),
        ("Accept-Encoding", ""),
    ]

    def __init__(self, queue, rezqueue):
        """Store the input (URL) queue and the output (e-mail) queue."""
        Process.__init__(self)
        self.queue = queue
        self.rezqueue = rezqueue

    def run(self):
        """Crawl URLs from ``queue`` until the ``None`` sentinel arrives.

        Returns:
            False once the sentinel has been received.
        """
        while True:
            site = self.queue.get()
            if site is None:
                return False
            self.crawl(site)

    def crawl(self, site):
        """Fetch ``site`` and hand the response body to :meth:`getem`.

        This is best-effort: any failure (bad URL, network error, timeout)
        is swallowed so that one broken site never stops the worker.
        """
        watchdog = None
        try:
            watchdog = Timer(15.0, self.WatchDog)
            watchdog.start()
            cookie_jar = cookielib.CookieJar()
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
            opener.addheaders = list(self.REQUEST_HEADERS)
            resp = opener.open(site, timeout=10)
            try:
                body = resp.read()
            finally:
                # Release the socket even when reading fails.
                resp.close()
            self.getem(body)
        except Exception:
            # Deliberately ignored: unreachable or broken sites are expected.
            pass
        finally:
            # Always cancel, also on errors, so no timer thread outlives the
            # request (stray timer threads can raise at interpreter shutdown).
            if watchdog is not None:
                watchdog.cancel()

    def getem(self, resp):
        """Extract the e-mail addresses in ``resp`` and queue them.

        Addresses are lowercased BEFORE de-duplication, so case variants of
        the same address are queued only once per page.

        Returns:
            None on success, False if extraction raised an exception.
        """
        try:
            matches = self.EMAIL_RE.findall(str(resp))
            # Generator form of set() keeps Python 2.6 compatibility.
            for address in set(match.lower() for match in matches):
                self.rezqueue.put(address)
        except Exception:
            return False

    def WatchDog(self):
        """Timer callback; a placeholder that performs no action."""
        return False

def normalize_url(line):
    """Turn one line of the domain file into a URL to crawl.

    Surrounding whitespace (including the line ending) is removed.  Lines
    that already start with ``http://`` or ``https://`` are kept as they
    are; anything else gets an ``http://`` prefix.

    Returns:
        The URL string, or None for a blank line.
    """
    url = line.strip()
    if not url:
        return None
    if url.lower().startswith(("http://", "https://")):
        return url
    return "http://" + url


def main():
    """Run the crawler: ``script.py Threads DomainFile.txt``."""
    if len(sys.argv) < 3:
        print("Usage: %s Threads DomainFile.txt" % sys.argv[0])
        print("\tExample:  %s 30 domains.txt" % sys.argv[0])
        # Non-zero status: being called with missing arguments is an error.
        sys.exit(1)

    try:
        worker_count = int(sys.argv[1])
    except ValueError:
        worker_count = 0
    if worker_count < 1:
        print("Threads must be a positive integer, got: %s" % sys.argv[1])
        sys.exit(1)

    # Bounded so that a huge domain file is fed gradually; put() blocks
    # while the workers catch up.
    queue = Queue(maxsize=30000)
    rezqueue = Queue()

    workers = []
    for _ in range(worker_count):
        worker = Crawler(queue, rezqueue)
        worker.daemon = True
        worker.start()
        workers.append(worker)

    results_writer = GetResults(rezqueue)
    results_writer.daemon = True
    results_writer.start()

    # Plain "r" is enough: normalize_url() strips any "\r" left by
    # Windows line endings.
    with open(sys.argv[2], "r") as urls:
        for line in urls:
            url = normalize_url(line)
            if url is not None:
                queue.put(url)

    # One sentinel per worker so that every crawler terminates.
    for _ in range(worker_count):
        queue.put(None)
    for worker in workers:
        worker.join()

    # Only now can no further results arrive; tell the writer to finish.
    rezqueue.put(None)
    results_writer.join()
    print("All done!")


if __name__ == "__main__":
    main()

Edited by Elohim
  • Upvote 3
Link to comment
Share on other sites

de ce ?

 File "/usr/lib/python2.6/threading.py", line 737, in run
File "/usr/lib/python2.6/threading.py", line 380, in set
File "/usr/lib/python2.6/threading.py", line 291, in notifyAll
<type 'exceptions.TypeError'>: 'NoneType' object is not callable
<type 'exceptions.TypeError'>: 'NoneType' object is not callable

Link to comment
Share on other sites

de ce ?

 File "/usr/lib/python2.6/threading.py", line 737, in run
File "/usr/lib/python2.6/threading.py", line 380, in set
File "/usr/lib/python2.6/threading.py", line 291, in notifyAll
<type 'exceptions.TypeError'>: 'NoneType' object is not callable
<type 'exceptions.TypeError'>: 'NoneType' object is not callable

Ai python 2.6; foloseste versiunea 2.7 de python si nu cred ca o sa mai ai probleme.

Link to comment
Share on other sites

Join the conversation

You can post now and register later. If you have an account, sign in now to post with your account.

Guest
Reply to this topic...

×   Pasted as rich text.   Paste as plain text instead

  Only 75 emoji are allowed.

×   Your link has been automatically embedded.   Display as a link instead

×   Your previous content has been restored.   Clear editor

×   You cannot paste images directly. Upload or insert images from URL.



×
×
  • Create New...