Jump to content
Elohim

[Python] Pastebin.com eMail Spider

Recommended Posts

Posted

import re

import urllib2

import cookielib

import time

import Queue

import threading

import sys

import random

# Listing page that is polled for links to pastes.
URL = "http://pastebin.com/archive"

# Paste links already pushed onto the work queue, and lower-cased e-mail
# addresses already written out during this run (used for de-duplication).
links = []
emlst = []

# Output files, opened in append mode so earlier contents are preserved.
linkfile = open("pblinks.txt", "a")
emf = open("crawled.txt", "a")

# Compiled once at import time instead of on every call.  The pattern text is
# unchanged; re.IGNORECASE is added because the character classes mix cases
# (e.g. ``[a-z0-9-]*`` inside a domain label and a lower-case-only TLD list),
# which made upper-case addresses such as ``USER@GMAIL.COM`` fail to match
# even though every hit is lower-cased before being stored.
_EMAIL_RE = re.compile(
    r"[A-Za-z0-9%&*+?^_`{|}~-]+(?:\.[A-Za-z0-9!#$%&*+?^_`{|}~-]+)*@(?:[A-Za-z0-9](?:[a-z0-9-]*[A-Za-z0-9])?\.)+(?:[A-Za-z]{2}|com|org|net|edu|gov|mil|biz|info|mobi|name|aero|asia|jobs|museum)\b",
    re.IGNORECASE,
)


def getem(resp):
    """Record every not-yet-seen e-mail address found in ``resp``.

    ``resp`` is converted with ``str()`` and scanned with ``_EMAIL_RE``.
    Each match is lower-cased; addresses not already present in the
    module-level ``emlst`` are appended to it, written to the module-level
    file ``emf`` (one per line, flushed immediately) and printed.

    Returns ``None`` on success and ``False`` if any exception occurred
    (the error is printed rather than silently discarded).
    """
    try:
        for em in _EMAIL_RE.findall(str(resp)):
            em = em.lower()
            if em in emlst:
                continue
            emlst.append(em)
            emf.write(em + "\n")
            # Flush per address so results survive an abrupt stop.
            emf.flush()
            print(em)
    except Exception as e:
        # Best effort: report the problem and let the caller carry on.
        print("getem failed: %s" % e)
        return False

def WatchDog():
    """Timer callback: print a notice and return ``False``.

    Nothing is stopped or interrupted here; the function only reports.
    """
    print("Fucked up, quiting")
    return False

class Crawler(threading.Thread):
    """Worker thread that fetches URLs from a queue and scans them for e-mails."""

    def __init__(self, queue):
        """Store the work queue; ``queue`` must offer ``get()`` and ``task_done()``."""
        threading.Thread.__init__(self)
        self.queue = queue

    def run(self):
        """Process queue items forever, one URL at a time."""
        while True:
            site = self.queue.get()
            try:
                self.crawl(site)
            finally:
                # Always acknowledge the item, otherwise queue.join() would
                # block forever if crawl() ever raised.
                self.queue.task_done()

    def crawl(self, site):
        """Download ``site`` and pass the body to ``getem``.

        Sleeps a random 1-14 seconds first to spread requests out, and arms
        an 18-second ``WatchDog`` timer around the download.

        Returns ``None`` on success and ``False`` if the download failed
        (the exception is printed).
        """
        time.sleep(random.randrange(1, 15))
        watchdog = threading.Timer(18.0, WatchDog)
        watchdog.start()
        try:
            print("Working on %s" % site)
            cj = cookielib.CookieJar()
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
            # A single list: assigning addheaders twice replaced the first
            # set, so Accept and User-Agent were never sent.  The header name
            # is "Accept" (no trailing colon).
            opener.addheaders = [
                ("Accept", "*/*"),
                ("User-Agent", "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:31.0) Gecko/20100101 Firefox/31.0"),
                ("Content-Type", "text/html; charset=utf-8"),
                ("Accept-Encoding", ""),
            ]
            resp = opener.open(site, timeout=15)
            try:
                body = resp.read()
            finally:
                resp.close()
        except Exception as e:
            print(e)
            return False
        finally:
            # Cancel on every path so a failed request does not leave a timer
            # behind that fires later.
            watchdog.cancel()
        getem(body)

# Number of worker threads and the bounded queue they all consume from.
ThreadNumber = 2
queue = Queue.Queue(maxsize=40000)

# Workers are daemon threads, so they do not keep the interpreter alive
# once the main thread finishes.
for i in range(ThreadNumber):
    t = Crawler(queue)
    t.daemon = True
    t.start()

def getToken(contentHtml):
    """Return the ``href`` values of the paste links found in ``contentHtml``.

    A link counts only when its ``<a>`` tag directly follows the exact
    ``<td><img ... class="i_p0" ... />`` markup matched below.  The result is
    a list of strings, empty when nothing matches.
    """
    link_pattern = re.compile(
        '<td><img src="/i/t.gif" class="i_p0" alt="" border="0" /><a href="(.*?)">'
    )
    return link_pattern.findall(contentHtml)

# One cookie-aware opener is reused for every poll of the listing page.
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
# A single list: assigning addheaders twice replaced the first set, so Accept
# and User-Agent were never sent.  The header name is "Accept" (no colon).
opener.addheaders = [
    ("Accept", "*/*"),
    ("User-Agent", "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:31.0) Gecko/20100101 Firefox/31.0"),
    ("Content-Type", "text/html; charset=utf-8"),
    ("Accept-Encoding", ""),
]

# Poll the listing page 1000 times, queueing every link not seen before.
for i in range(1000):
    try:
        print("Getting links")
        resp = opener.open(URL, timeout=15)
        try:
            linklist = getToken(resp.read())
        finally:
            resp.close()
        print("Got %d Links" % len(linklist))
        for link in linklist:
            if link not in links:
                # link starts with "/"; drop it to get the bare paste id.
                queue.put("http://pastebin.com/raw.php?i=" + link[1:])
                links.append(link)
    except Exception as e:
        print(e)
    # Pause on every pass, including failed ones, so an error does not turn
    # the loop into a rapid retry storm against the server.
    print("Sleeping")
    time.sleep(90)
    print("Phase %d" % i)

# Wait until the workers have acknowledged every queued URL.
queue.join()

Urmareste toate paste-urile publice, si cauta mail-uri in ele. Ce gaseste, salveaza in crawled.txt.

Se poate adapta de cine doreste sa caute email : pass , config-uri, etc.

Asa cum e setat, nu se supara pastebin.com si nu blocheaza nimic. Daca mariti threadurile, nu garantez ca nu va blocheaza.

Posted
Post-eaza mesajul de eroare complet. Este posibil sa nu ai urllib2.

[root@localhost test]# python crawler.py

File "crawler.py", line 52

opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(c j))

^

SyntaxError: invalid syntax

asta e tot

Posted
[root@localhost test]# python crawler.py

File "crawler.py", line 52

opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(c j))

^

SyntaxError: invalid syntax

asta e tot

Verifica indentarile. Poti folosi PyCharm pentru a face acest lucru automat.

Posted (edited)

Dupa cate vad, scriptul este scris in python 2.x, (print ""), incearca sa il rulezi cu binary-ul care trebuie, e posibil sa ai default python 3.x.

Spre exemplu, in Arch:

$ python
Python 3.4.3 (default, Feb 26 2015, 23:01:07)
[GCC 4.9.2 20150204 (prerelease)] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>>

Edit: s-a uitat o virgula in cod, sau ceva

sdftjr.png

Edited by pr00f
Posted (edited)
Dupa cate vad, scriptul este scris in python 2.x, (print ""), incearca sa il rulezi cu binary-ul care trebuie, e posibil sa ai default python 3.x.

Spre exemplu, in Arch:

$ python
Python 3.4.3 (default, Feb 26 2015, 23:01:07)
[GCC 4.9.2 20150204 (prerelease)] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>>

Edit: s-a uitat o virgula in cod, sau ceva

sdftjr.png

am editat si acum imi da asa ..

Working on http://pastebin.com/raw.php?i=ZDAsw9xD

Working on http://pastebin.com/raw.php?i=icGFDC2a

sper ca e bine

Edited by Hubba

Join the conversation

You can post now and register later. If you have an account, sign in now to post with your account.

Guest
Reply to this topic...

×   Pasted as rich text.   Paste as plain text instead

  Only 75 emoji are allowed.

×   Your link has been automatically embedded.   Display as a link instead

×   Your previous content has been restored.   Clear editor

×   You cannot paste images directly. Upload or insert images from URL.



×
×
  • Create New...