pr00f

wospi - world wide web word crawler for generating wordlists


I needed a crawler that generates wordlists based on the content of a page or website, and all I could find was cewl, but that's written in Ruby and I don't like the idea of gems (I'm more of a jam person), so I decided to write my own. It may not be as good or as feature-rich as cewl (or whatever else is out there), but it works, and it works well.


It needs requests and BeautifulSoup.

The source, along with setup instructions: https://github.com/vlad-s/wospi
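
Both dependencies are on PyPI (something like pip install requests beautifulsoup4 should cover it), and, going by the argparse flags at the bottom of the script, a run would look roughly like python wospi.py -w wordlist.txt --depth 1 https://example.com, where wospi.py, the output file and the URL are just placeholders for whatever you actually use.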


"""
wospi 0.1

word spider whose sole purpose is to crawl for strings
and generate a wordlist
"""

__author__ = "Vlad <vlad at vlads dot me>"
__version__ = "0.1"
__license__ = "GPL v2"

# pylint: disable=import-error
# pylint can't find BeautifulSoup (installed with pip)

import requests
import argparse
from threading import Thread
from bs4 import BeautifulSoup

class WordSpider(object):
""" Main class """

def __init__(self, output, url):
self.min_length = 4
self.user_agent = "wospi (v0.1) word spiderbro"
self.with_strip = False
self.output = output
self.url = url

self.data_dict = {"words": [], "urls": [], "strip": ".,\"'"}

try:
self.outfile = open(self.output, "w")
except IOError:
print "Can't write the file. Do you have write access?"
exit(1)

def url_magic(self, url, depth):
""" Do the URL boogie all night long """

domain = self.url.split("/")[0]+"//"+self.url.split("/")[2]

if url.startswith("/"):
crawl_url = domain+url
elif url.startswith(domain):
crawl_url = url
else:
return

if crawl_url not in self.data_dict.get("urls"):
self.data_dict.get("urls").append(crawl_url)
link_worker = Thread(target=self.request,
args=(crawl_url, int(depth)-1))
link_worker.start()

def request(self, url, depth):
""" Do request, get content, spread the word """

if depth < 0:
exit(1)

if url.startswith("/"):
url_split = url.split("/")
url = url_split[0] + "//" + url_split[2]

print "[+] URL: %s" % url

headers = {"user-agent": self.user_agent}
try:
req = requests.get(url, headers=headers, timeout=3)
except requests.ConnectionError:
print "[+] Connection error, returning."
return
except requests.HTTPError:
print "[+] Invalid HTTP response, returning."
return
except requests.Timeout:
print "[+] Request timed out, returning."
return
except requests.TooManyRedirects:
print "[+] Too many redirections, returning."
return

if "text/html" not in req.headers.get("content-type"):
print "[+] Content type is not text/html, returning."
return

soup = BeautifulSoup(req.text, "html.parser")
for invalid_tags in soup(["script", "iframe", "style"]):
invalid_tags.extract()

for link in soup.find_all("a"):
if not isinstance(link.get("href"), type(None)):
self.url_magic(link.get("href"), depth)

data_worker = Thread(target=self.parse_data,
args=(soup.get_text(), ))
data_worker.start()

def parse_data(self, data):
""" Parse the data after request """
data = data.replace("\r\n", " ").replace("\n", " ").split()

for word in data:
word = word.encode("utf-8")
if word not in self.data_dict.get("words"):
if len(word) >= self.min_length:
if self.with_strip == True:
stripped = word
for char in self.data_dict.get("strip"):
stripped = stripped.strip(char)
self.data_dict.get("words").append(word)
self.outfile.write(word+"\n")
if self.with_strip == True and stripped != word:
self.data_dict.get("words").append(stripped)
self.outfile.write(stripped+"\n")

def run(self, depth=0):
""" Run, scraper, run! """
self.request(self.url, depth)

if __name__ == "__main__":
PARSER = argparse.ArgumentParser(description="word scraper/wordlist\
generator")
PARSER.add_argument("--min-length", type=int, default=4, help="minimum\
word length, defaults to 4")
PARSER.add_argument("--user-agent", help="user agent to use on requests")
PARSER.add_argument("--with-strip", action="store_true", help="also store\
the stripped word")
PARSER.add_argument("--write", "-w", required=True, dest="file",
help="file to write the content in")
PARSER.add_argument("--depth", default=0, help="crawling depth, defaults\
to 0")
PARSER.add_argument("url", type=str, help="url to scrape")

ARGS = PARSER.parse_args()

SCRAPER = WordSpider(ARGS.file, ARGS.url)

if ARGS.min_length is not None:
SCRAPER.min_length = ARGS.min_length
if ARGS.user_agent is not None:
SCRAPER.user_agent = ARGS.user_agent
if ARGS.with_strip == True:
SCRAPER.with_strip = True

SCRAPER.run(ARGS.depth)
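
If you'd rather drive it from another script instead of the command line, the WordSpider class can be used directly. A minimal sketch, assuming the code above is saved as wospi.py and runs under the same Python 2 interpreter; the URL and output file name are placeholders, not something from the original post:

# minimal programmatic sketch -- wospi.py, the URL and the output file
# name are placeholder assumptions
from wospi import WordSpider

scraper = WordSpider("wordlist.txt", "https://example.com")
scraper.min_length = 5      # optional, the default is 4
scraper.with_strip = True   # also write words with .,"' stripped off
scraper.run(depth=1)        # crawl the page plus same-domain links one level deep

Nothing else needs to be called afterwards: the output file is opened in __init__ and the worker threads write words into it as they are found.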
