generateDictionary.pl

Fi8sVrs · December 2, 2011

Description:

Script to create one sorted and unique wordlist from multiple wordlists. It takes as input N raw text files, a directory of them, or both. It parses the input and keeps only the strings whose length falls within the specified minimum and maximum. It then sorts all the data and removes duplicates for the final output. L33t Speak translation is optional; see the usage statement in the program.

Features:

L33t Speak
Supports mix of standard text with L33t Speak
Support for individual files, directory of files or both
Filters output to the word-length range specified by the user
Ensures uniqueness of output data
Sorts output

#!/usr/bin/perl

# Script to create one sorted and unique wordlist from multiple wordlists.
# It takes as input N raw text files, a directory of them, or both.
# It parses the input and filters by string length on min and max specified.
# It will then sort all the data and make it unique for final output.
# L33t Speak is an option, see usage statement below.
#							     
# Author: Andres Andreu <andres [at] neurofuzz dot com>
# File:   generateDictionary.pl
# Ver:	  1.0
# Usage:  perl generateDictionary.pl -file TEXT_FILE_1 -file TEXT_FILE_2 ... -file TEXT_FILE_N | -txtdir DIR_WHERE_FILES_ARE
#                                    -min MIN_WORD_LENGTH -max MAX_WORD_LENGTH -out OUTPUT_FILE
#                                    [-l33t [1 | 5 | 7 | 9]] [-mix 1]  
#

use strict;
use Getopt::Long; 

# Working state shared by the whole script.
my ($min, $max, $fout);        # word-length bounds and output file name
my ($txt_dir, $use_dir);       # optional input directory and its "in use" flag
my ($use_l33t, $l33t_val);     # L33t Speak switch and the level that was requested
my ($raw_dict, $key, $var);    # loop variables: current input file, output key, input line
my %count;                     # every word that has been accepted
my @rawfiles = ();             # input files still to be read

# Counters and flags that must start from a defined value.
my $counter  = 0;
my $fcounter = 0;
my $use_mix  = 0;

# Default translation level used by text231337.
my $LEVEL = 5;

Getopt::Long::Configure("permute");

###############################################################################
# taken from Lingua::31337 by Casey West
# C4S3y R. WES7 <C4SeY@g33KN3S7.c0m>
# Translation tables, keyed by L33t level. Each level holds:
#   mixcase - true when text231337 should also shuffle letter case
#   chars   - pattern => replacement; a replacement is either a single
#             value or an array reference listing several alternatives.
#             The keys are used as regular expressions, hence 's$' etc.
my %CONVERSIONS = (

  # Level 1: vowels only, case is left alone.
  1 => {
    mixcase => 0,
    chars   => {
      'a' => 4,
      'e' => 3,
      'i' => 1,
      'o' => 0,
    },
  },

  # Level 5: vowels plus a few consonants, no punctuation in the
  # replacements, random case shifting.
  5 => {
    mixcase => 1,
    chars   => {
      'a'  => 4,
      'e'  => 3,
      'f'  => 'ph',
      'i'  => 1,
      'l'  => 1,
      'o'  => 0,
      's$' => 'z',
      't'  => 7,
    },
  },

  # Level 7: vowels and most consonants, punctuation used to draw
  # letters, random case shifting.
  7 => {
    mixcase => 1,
    chars   => {
      'a'  => 4,
      'b'  => '|3',
      'd'  => '|)',
      'e'  => 3,
      'f'  => 'ph',
      'h'  => '|-|',
      'i'  => 1,
      'k'  => '|<',
      'l'  => 1,
      'm'  => '|\/|',
      'n'  => '|\|',
      'o'  => 0,
      's$' => 'z',
      't'  => '-|-',
      'v'  => '\/',
      'w'  => '\/\/',
      'x'  => '><',
    },
  },

  # Level 9: as level 7, plus multi-letter patterns and replacements
  # that offer several alternatives to choose from.
  9 => {
    mixcase => 1,
    chars   => {
      'a'   => [ 4, 'aw' ],
      'b'   => '|3',
      'ck'  => 'x',
      'ck$' => 'x0rz',
      'd'   => '|)',
      'e'   => [ 3, 0, 'o' ],
      'ed$' => 'z0r3d',
      'er$' => '0r',
      'f'   => 'ph',
      'h'   => '|-|',
      'i'   => 1,
      'k'   => '|<',
      'l'   => 1,
      'm'   => '|\/|',
      'n'   => '|\|',
      'o'   => 0,
      's'   => 'z',
      't'   => '-|-',
      'v'   => '\/',
      'w'   => '\/\/',
      'x'   => '><',
    },
  },
);

# Translate strings into L33t Speak.
#
# Arguments: a list of strings. The caller's values are not modified;
#            translated copies are returned.
# Returns:   the list of translated strings, in the same order.
#
# The table is taken from %CONVERSIONS at the file-level $LEVEL. If that
# level has no table, $LEVEL is lowered (and stays lowered) until one is
# found. Every substitution is applied at random with an even chance, so
# repeated calls with the same input generally give different output.
sub text231337 {
  my @text     = @_;
  my @new_text = ();

  # Fall back to the nearest lower level that has a table. Give up and
  # return the input untranslated once no lower level can exist, instead
  # of decrementing forever.
  until ( exists $CONVERSIONS{$LEVEL} ) {
    return @text if $LEVEL <= 0;
    $LEVEL--;
  }

  my $table = $CONVERSIONS{$LEVEL};
  my $chars = $table->{chars};

  foreach my $line ( @text ) {
    # Table keys are regular expressions (e.g. 's$'); they are matched
    # case-insensitively. Key order is whatever the hash yields.
    foreach my $pattern ( keys %{$chars} ) {
      my $replacement = $chars->{$pattern};
      if ( ref $replacement ) {
        # Several alternatives: index with the element count so that the
        # last alternative can be picked too (rand $#array never reaches it).
        $line =~ s/($pattern)/(0,1)[rand 2] ? $replacement->[rand @{$replacement}] : $1/egi;
      } else {
        $line =~ s/($pattern)/(0,1)[rand 2] ? $replacement : $1/egi;
      }
    }
    # Randomly flip the case of each remaining letter when the level asks for it.
    $line =~ s/([A-Z])/(0,1)[rand 2] ? uc($1) : lc($1)/egi if $table->{mixcase};
    push @new_text, $line;
  }
  return @new_text;
}
# End of module Lingua::31337 usage
###############################################################################

# Print the command-line help text to standard output and terminate the
# program. Takes no arguments and never returns to the caller.
sub usage(){
	print "\nUsage: perl generateDictionary.pl -file TEXT_FILE_1 -file TEXT_FILE_2 ... -file TEXT_FILE_N | -txtdir DIR_WHERE_FILES_ARE " .
	      "-min MIN_WORD_LENGTH -max MAX_WORD_LENGTH [-l33t [1 | 5 | 7 | 9]] [-mix 1] " .
	      "-out OUTPUT_FILE\n\n" .
	      "-min & -max establish the string length range you want your data filtered on\n" .
	      "-out sets the resource for final data output\n" .
	      # this line needs its own newline, otherwise it runs into the -txtdir line
	      "-file sets individual files to feed the final dictionary (enter as many as you like)\n" .
	      "[-txtdir] the path to a directory where some source dictionary files exist\n" .
	      "[-l33t] sets on the option to use L33t Speak translation, valid values are 1,5,7,9\n" .
	      "[-mix] sets on the option to use both L33T Speak and clear text strings simultaneously\n" .
	      "when the \"-mix\" switch is used, the \"-l33t\" switch MUST also be used\n\n";
	exit();
}

#Define initial hash
# Options without an explicit destination are stored in %opts under their
# own name. -file may be given many times; each value is appended to
# @rawfiles.
my %opts=();
# GetOptions returns false when the command line holds an unknown option
# or a value of the wrong type; show the usage text instead of carrying
# on with a partially parsed command line.
GetOptions(\%opts,"min=i",
		"max=i",
		"file=s" => \@rawfiles,
		"txtdir=s",
		"l33t=s",
		"mix=s",
		"out=s") or usage();

#Display Usage if no options were passed
if(keys(%opts)==0) {
	usage();
}


#Process the options
# Copy the parsed values out of %opts into the script's working variables.
# Values that were not supplied leave their variable untouched.
$min  = $opts{min} if defined($opts{min});
$max  = $opts{max} if defined($opts{max});
$fout = $opts{out} if defined($opts{out});

# -file has no entry here: GetOptions was given \@rawfiles as its
# destination, so those values are already in @rawfiles. Pushing
# $opts{file} as well could only ever add a duplicate.

if (defined($opts{txtdir})) {
	$txt_dir = $opts{txtdir};
	$use_dir = 1;
}

if (defined($opts{l33t})) {
	$l33t_val = $opts{l33t};
	$use_l33t = 1;
}

# Only the presence of -mix is tested; its value is not inspected.
$use_mix = 1 if defined($opts{mix});


#Handle conditions
# -min, -max and -out are all required; a value of 0 counts as missing.
if ((!$min) || (!$max) || (!$fout)) {
	usage();
}

# At least one input source (files or a directory) is required.
if ((!@rawfiles) && (!$txt_dir)) {
	usage();
}

if (($use_mix) && (!$use_l33t)) { 
	print "If you want to use the \"mix\" mode for both normal strings " .
	      "and L33t Speak combined, then you must use the \"-l33t\" switch \n\n";
	usage();
}

if ($use_l33t) {
	# -l33t is parsed as a string, so test it as one. A numeric comparison
	# accepts values such as "1abc" (numerically 1), which then name no
	# conversion table.
	if ($l33t_val !~ /\A[1579]\z/) {
		    print "The only acceptable values for the L33t Speak option are " .
		    	  "1 | 5 | 7 | 9 \n\n";
		    usage();
	}
	$LEVEL = $l33t_val;
}

# Create (or truncate) the output file before any input is read.
# Three-argument open keeps characters in the file name from being read
# as part of the open mode.
open(FINALDICT, '>', $fout) or die "Can't open output file $fout: $!\n";

# dir listing used
if ($use_dir) {
	# Remove trailing slashes so that exactly one separator is placed
	# between the directory and each entry, whether or not the user
	# typed one.
	(my $dir_prefix = $txt_dir) =~ s{/+\z}{};

	opendir(DIRTXT,$txt_dir) or die "Can't access " . $txt_dir . ": $!\n";
	foreach my $entry (readdir(DIRTXT)){
		my $path = "$dir_prefix/$entry";
		# Queue plain files only; this leaves out ".", ".." and subdirectories.
		next unless -f $path;
		push(@rawfiles, $path);
	}
	closedir (DIRTXT);
}

# iterate through each text file to be parsed
# ignore this prog and hidden files via regex
foreach $raw_dict (@rawfiles) {	
	# strip start and end white space before the name is inspected or opened
	$raw_dict =~ s/^\s+//;
	$raw_dict =~ s/\s+$//;

	# no need to process these
	next if ($raw_dict eq ".") || ($raw_dict eq "..");
	# this program itself (the dot is escaped so it only matches a dot)
	next if $raw_dict =~ m/generateDictionary\.pl/;
	# regex out hidden files as such: /.
	next if $raw_dict =~ m/\/\./;

	#increment file counter
	$fcounter++;

	# Three-argument open: the name is always treated as a file to read,
	# never as a mode or a command.
	open(RAWDICT, '<', $raw_dict) or die "Can't open input file $raw_dict: $!\n";
	while ($var=<RAWDICT>) {
		$var =~ s/^\s+//;			
		$var =~ s/\s+$//;

		# Skip blank lines. This is a length test rather than a truth
		# test so that the word "0" is kept.
		next unless length($var);
		next if (length($var) > $max) || (length($var) < $min);

		# L33t form when translation is on; plain form when translation
		# is off or when both forms were requested with -mix.
		$count{join("",text231337($var))}++ if $use_l33t;
		$count{$var}++ if (!$use_l33t) || $use_mix;
	}
	close (RAWDICT);
}

# Hash keys are unique by construction; sort() then puts them in
# ascending string order for the output file.
foreach $key (sort keys %count) {
   print FINALDICT "$key\n" or die "Can't write to output file $fout: $!\n";
   $counter++;
}

# Close before reporting success: buffered write errors (a full disk, for
# example) only show up when the handle is closed.
close (FINALDICT) or die "Can't close output file $fout: $!\n";

print "\nYour sorted unique dictionary consists of data from " . 
      $fcounter . " raw files, is located in file \"" .
      $fout . "\" and has " . $counter . " items in it ...\n\n";	

Mirror

Sign In

generateDictionary.pl

Recommended Posts

Fi8sVrs

Join the conversation

Browse

Activity

Pages