wospi - world wide web word crawler for generating wordlists

Am avut nevoie de un crawler care genereaza wordlist-uri in functie de continutul unei pagini/unui website, si tot ce am gasit a fost cewl, dar e scris in ruby si nu-mi place ideea de gem-uri (sunt fan dulceata), asa ca am decis sa scriu eu unul. Poate nu e la fel de bun/featureful precum cewl (sau altele?), dar functioneaza, si functioneaza bine.


Necesita requests si BeautifulSoup.

Sursa, precum si informatii despre instalare si configurare: https://github.com/vlad-s/wospi

"""
wospi 0.1

word spider whose sole purpose is to crawl for strings
and generate a wordlist
"""

__author__ = "Vlad <vlad at vlads dot me>"
__version__ = "0.1"
__license__ = "GPL v2"

# pylint: disable=import-error
# pylint can't find BeautifulSoup (installed with pip)

import argparse
import io
from threading import Thread

import requests
from bs4 import BeautifulSoup

class WordSpider(object):
    """Crawl a website and write every new word it finds to a wordlist file.

    Attributes:
        min_length: Minimum number of characters a word needs to be stored.
        user_agent: Value sent in the ``user-agent`` request header.
        with_strip: When true, also store each word with the characters in
            ``data_dict["strip"]`` removed from both ends.
        output: Path of the wordlist file.
        url: Start URL; only links on its scheme and host are followed.
        data_dict: Shared crawl state -- ``"words"`` (words already
            written), ``"urls"`` (links already scheduled) and ``"strip"``
            (characters removed by ``with_strip``).
        outfile: Open handle on ``output``.
    """

    def __init__(self, output, url):
        """Set the defaults and open the output file for writing.

        Args:
            output: Path of the wordlist file; truncated if it exists.
            url: Absolute URL (``scheme://host/...``) to start crawling at.

        Raises:
            IOError: If the output file cannot be opened for writing.
        """
        self.min_length = 4
        self.user_agent = "wospi (v0.1) word spiderbro"
        self.with_strip = False
        self.output = output
        self.url = url

        self.data_dict = {"words": [], "urls": [], "strip": ".,\"'"}

        # The handle stays open for the lifetime of the spider: worker
        # threads started by request() keep writing after run() returns.
        try:
            self.outfile = io.open(self.output, "w", encoding="utf-8")
        except IOError:
            print("Can't write the file. Do you have write access?")
            # Without an output file nothing else can work, so let the
            # caller see the failure instead of continuing half-built.
            raise

    def _store(self, word):
        """Remember *word* as seen and append it to the output file."""
        self.data_dict["words"].append(word)
        self.outfile.write(word + u"\n")

    def url_magic(self, url, depth):
        """Schedule a crawl of *url* if it is a new link on the start site.

        Args:
            url: ``href`` value taken from a page. Root-relative links
                (``/path``) are resolved against the start URL's scheme
                and host; absolute links are accepted only when they
                start with that same prefix. Anything else is ignored.
            depth: Depth of the page the link was found on; the linked
                page is requested with ``depth - 1``.
        """
        parts = self.url.split("/")
        if len(parts) < 3:
            # The start URL has no "scheme://host" prefix to resolve against.
            return
        domain = parts[0] + "//" + parts[2]

        if url.startswith("/"):
            crawl_url = domain + url
        elif url.startswith(domain):
            crawl_url = url
        else:
            # Foreign site, fragment, mailto:, relative path, ...
            return

        seen_urls = self.data_dict["urls"]
        if crawl_url in seen_urls:
            return

        # Record the URL before starting the worker so that the same link
        # found again on another page is not scheduled twice.
        seen_urls.append(crawl_url)
        link_worker = Thread(target=self.request,
                             args=(crawl_url, int(depth) - 1))
        link_worker.start()

    def request(self, url, depth):
        """Fetch *url*, schedule its links and hand its text to a parser thread.

        Returns without doing anything when *depth* is negative, when the
        request fails, or when the response is not ``text/html``.

        Args:
            url: URL to fetch.
            depth: Remaining crawl depth; may be an int or a numeric string.
        """
        depth = int(depth)
        if depth < 0:
            return

        if url.startswith("/"):
            # NOTE(review): this rewrite reduces "//host/path" to "//host",
            # which still has no scheme; the original intent is unclear.
            # The expression is kept and only guarded against IndexError.
            url_split = url.split("/")
            if len(url_split) > 2:
                url = url_split[0] + "//" + url_split[2]

        print("[+] URL: %s" % url)

        headers = {"user-agent": self.user_agent}
        try:
            req = requests.get(url, headers=headers, timeout=3)
        except requests.ConnectionError:
            print("[+] Connection error, returning.")
            return
        except requests.HTTPError:
            print("[+] Invalid HTTP response, returning.")
            return
        except requests.Timeout:
            print("[+] Request timed out, returning.")
            return
        except requests.TooManyRedirects:
            print("[+] Too many redirections, returning.")
            return
        except requests.RequestException as err:
            # Anything else requests can raise (e.g. a URL with no scheme)
            # would otherwise kill the worker thread with a traceback.
            print("[+] Request failed (%s), returning." % err)
            return

        # A response without a content-type header is treated as non-HTML.
        if "text/html" not in req.headers.get("content-type", ""):
            print("[+] Content type is not text/html, returning.")
            return

        soup = BeautifulSoup(req.text, "html.parser")
        # Drop elements whose text is code or embedded content, not words.
        for invalid_tag in soup(["script", "iframe", "style"]):
            invalid_tag.decompose()

        for link in soup.find_all("a"):
            href = link.get("href")
            if href is not None:
                self.url_magic(href, depth)

        data_worker = Thread(target=self.parse_data,
                             args=(soup.get_text(), ))
        data_worker.start()

    def parse_data(self, data):
        """Split *data* into words and store the ones not seen before.

        A word is stored when it is at least ``min_length`` characters long
        and not already in ``data_dict["words"]``. With ``with_strip`` set,
        the word with the ``data_dict["strip"]`` characters removed from
        both ends is stored as well, under the same two conditions.

        Args:
            data: Text of a page (unicode text, as BeautifulSoup returns it).
        """
        seen_words = self.data_dict["words"]
        strip_chars = self.data_dict["strip"]

        # split() without arguments already breaks on every kind of
        # whitespace, including "\r\n" and "\n".
        for word in data.split():
            if word in seen_words or len(word) < self.min_length:
                continue
            self._store(word)

            if not self.with_strip:
                continue
            # One strip() call with the whole character set removes mixed
            # runs such as '"word"' regardless of character order.
            stripped = word.strip(strip_chars)
            if (stripped != word
                    and len(stripped) >= self.min_length
                    and stripped not in seen_words):
                self._store(stripped)

    def run(self, depth=0):
        """Start crawling at the start URL, following links *depth* levels deep.

        Args:
            depth: Number of link levels to follow; an int or numeric string.
        """
        self.request(self.url, int(depth))

# Command-line entry point: parse the options, configure a WordSpider and
# start the crawl.
if __name__ == "__main__":
    PARSER = argparse.ArgumentParser(
        description="word scraper/wordlist generator")
    PARSER.add_argument("--min-length", type=int, default=4,
                        help="minimum word length, defaults to 4")
    PARSER.add_argument("--user-agent", help="user agent to use on requests")
    PARSER.add_argument("--with-strip", action="store_true",
                        help="also store the stripped word")
    PARSER.add_argument("--write", "-w", required=True, dest="file",
                        help="file to write the content in")
    # type=int so the depth reaches the spider as a number, not a string.
    PARSER.add_argument("--depth", type=int, default=0,
                        help="crawling depth, defaults to 0")
    PARSER.add_argument("url", type=str, help="url to scrape")

    ARGS = PARSER.parse_args()

    SCRAPER = WordSpider(ARGS.file, ARGS.url)

    # Only override the spider's defaults with options that were supplied.
    if ARGS.min_length is not None:
        SCRAPER.min_length = ARGS.min_length
    if ARGS.user_agent is not None:
        SCRAPER.user_agent = ARGS.user_agent
    if ARGS.with_strip:
        SCRAPER.with_strip = True

    # Without this call the script only parses its arguments and exits.
    SCRAPER.run(ARGS.depth)


Edited by pr00f
