Jump to content
moubik

simplu mail crawler in php

Recommended Posts

Posted

am gasit la un moment dat pe acest site muler mail crawler

si nu mi-a placut foarte mult

mi-am scris eu acest programel

http://www.wikiupload.com/download_page.php?id=211708


<?php
// Hawler start-up: default settings, command-line parsing, optional MySQL
// connection and normalisation of the start URL.
//
// NOTE(review): the mysql_* functions used here (and by the crawl loop further
// down) belong to the original MySQL extension, which was removed in PHP 7.
// Moving to mysqli/PDO has to be done for the whole file at once.

// Name of the setting that the next argument will fill; "" when none is pending.
$_par = "";

$startLink = "";
$maximumLinks = 0; // 0 for infinite
$userAgent = 1;
$baseOnly = false;
$baseUrl = "";

$mysql_tablename = "hawler_visitedlinks";
$mysql_server = "";
$mysql_user = "";
$mysql_pass = "";
$mysql_database = "";

// Process parameters. $argv[0] is the script name, so it is skipped.
foreach (array_slice($argv, 1) as $param)
{
    if ($_par !== "")
    {
        // This argument is the value of the previous option. It must not be
        // interpreted as an option itself (e.g. a password equal to "-h").
        $$_par = $param;
        $_par = "";
        continue;
    }

    if (in_array($param, array("-h", "-help", "--help", "-?", "/?"), true)) displayHelp();
    if (in_array($param, array("-s", "-start"), true)) $_par = "startLink";
    if (in_array($param, array("-m", "-max"), true)) $_par = "maximumLinks";
    if (in_array($param, array("-u", "-useragent"), true)) $_par = "userAgent";
    if (in_array($param, array("-b", "-baseurl"), true))
    {
        $baseOnly = true;
        $_par = "baseUrl";
    }
    if (in_array($param, array("-sqlserver", "-dbserver"), true)) $_par = "mysql_server";
    if (in_array($param, array("-sqluser", "-dbuser"), true)) $_par = "mysql_user";
    if (in_array($param, array("-sqlpass", "-dbpass"), true)) $_par = "mysql_pass";
    if (in_array($param, array("-sqldb", "-dbdb"), true)) $_par = "mysql_database";
}

// Command-line values arrive as strings; the numeric settings are compared
// against integers later on.
$maximumLinks = (int) $maximumLinks;
$userAgent = (int) $userAgent;

// Without a start link there is nothing to crawl.
if ($startLink === "")
{
    echo "No start link given (use -s [url])\n\n";
    displayHelp();
}

if (!empty($mysql_server))
{
    $mysql = mysql_connect($mysql_server, $mysql_user, $mysql_pass);
    if (!$mysql)
    {
        die ("Error connecting:" . mysql_error());
    }
    echo "Connected to mysql database\n";

    // A failure here must stop the script: the crawl loop treats every failed
    // INSERT as "link already visited", so it would silently queue nothing.
    if (!mysql_select_db($mysql_database))
    {
        die ("Error selecting database:" . mysql_error());
    }

    // Creating table if it does not exist
    echo "Creating table $mysql_tablename\n";
    if (!mysql_query("CREATE TABLE IF NOT EXISTS $mysql_tablename (link TEXT(500), UNIQUE (link(500)))"))
    {
        die ("Error creating table:" . mysql_error());
    }
}

// Prepend a scheme only when the link does not already start with one
// (https links must be left alone), then make sure a bare host ends in "/".
if (!preg_match('/^https?:\/\//i', $startLink)) $startLink = "http://" . $startLink;
if (substr_count($startLink, "/") <= 2)
{
    $startLink = $startLink . "/";
}

echo "Startlink '$startLink'\n";
echo "User-Agent '$userAgent'\n";
echo "Maximum links '$maximumLinks'\n";

/**
 * Prints the command-line usage text and terminates the script.
 */
function displayHelp()
{
    // The heredoc drops the line break before its closing marker, so the final
    // newline of the usage text is echoed separately.
    $usage = <<<HAWLER_USAGE
Hawler, mail crawler
By Moubik

-h : help
-s, -start [url] : start link
-m, -max [number] : maximum number of links to hold in memory, 0 for infinite
-u, -useragent [1-3] : 1=Mozilla, 2=Googlebot, 3=Internet Explorer
-b, -baseurl [url] : string every link must contain
======================================================================================
If you use a database for saving the links you visited
-sqlserver [ip/dns] : sqlserver name
-sqluser [user] : user to login with
-sqlpass [pass] : pass to login with
-sqldb [databasename] : database to save to

2007
HAWLER_USAGE;

    echo $usage, "\n";
    exit();
}

/**
 * Creates a cURL handle for $url, configured to return the body as a string
 * (without headers) and to follow redirects.
 *
 * The User-Agent header is chosen from the global $userAgent setting using the
 * numbering printed by the help text: 1=Mozilla, 2=Googlebot,
 * 3=Internet Explorer. Any other value leaves cURL's default in place.
 *
 * @param string $url address the handle will fetch
 * @return mixed the configured handle, as returned by curl_init()
 */
function initCurlVariable($url)
{
    global $userAgent;

    // Keys follow the numbering advertised by the -u/-useragent option.
    $agents = array(
        1 => "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.7) Gecko/20070914 Firefox/2.0.0.7",
        2 => "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
        3 => "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30)",
    );

    $auxCurl = curl_init ( $url );
    curl_setopt ( $auxCurl, CURLOPT_HEADER, 0 );
    curl_setopt ( $auxCurl, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt ( $auxCurl, CURLOPT_FOLLOWLOCATION, 1);

    // The setting may still be the raw command-line string.
    $agentId = (int) $userAgent;
    if (isset($agents[$agentId]))
    {
        curl_setopt( $auxCurl, CURLOPT_USERAGENT, $agents[$agentId]);
    }

    return $auxCurl;
}

// Crawl state. $lastlink is the URL of the page currently held in $page;
// relative links are resolved against it.
$lastlink = $startLink;
$pg = $startLink;

// Fetch the start page up front; the crawl loop below parses $page first and
// only then takes further URLs from the queue.
$curlPage = initCurlVariable($startLink);
$page = curl_exec( $curlPage );
if ($page === false)
{
    // Keep going with an empty page so that the later string handling never
    // receives a boolean.
    echo "Error fetching '$startLink': " . curl_error($curlPage) . "\n";
    $page = "";
}
curl_close($curlPage);
//echo $page;
$line_links = "";

// The queue holds one empty placeholder entry at start-up.
$siteQueue = array("");
// The start page is already fetched, so it counts as visited from the beginning.
$visitedSites = array("", $startLink);
$siteNumber = 1;
$total_links = 0;
$pageTemp = "";

$mails = array();
$dumpMails = fopen ("mails.txt", "a");
$dumpLinks = fopen ("links.txt", "a");
$dumpAbsoluteLinks = fopen ("absolution.txt", "a");
if (!$dumpMails || !$dumpLinks || !$dumpAbsoluteLinks)
{
    // Every later fprintf() needs these handles; fail early instead.
    die ("Error opening mails.txt, links.txt or absolution.txt for appending\n");
}

/**
 * Flattens a (possibly nested) array of address strings, strips link residue
 * from each one, appends it to the global $mails list and writes it as one
 * line to the global $dumpMails handle.
 *
 * @param array $mailArray strings, or arrays of strings at any depth
 * @return array the global $mails list after the additions
 */
function parseMail($mailArray)
{
    global $mails, $dumpMails;

    // Removed in this order: the Yahoo web-message URL prefix, any remaining
    // slashes, URL-encoded spaces.
    $residue = array("//edit.yahoo.com/config/send_webmesg?.target=", "/", "%20");

    foreach ($mailArray as $entry)
    {
        if (is_array($entry))
        {
            parseMail($entry);
            continue;
        }

        $address = str_replace($residue, "", $entry);
        array_push($mails, $address);
        fprintf($dumpMails, "%s\n", $address);
    }

    return $mails;
}

/**
 * Finds e-mail addresses in a piece of text and hands them to parseMail().
 *
 * The top-level domain must be a two-letter code or one of the listed generic
 * domains. The list includes edu, mil, int, coop, pro, travel and mobi,
 * because an address on a domain that is not listed is dropped entirely: the
 * two-letter alternative cannot satisfy the trailing word boundary.
 *
 * @param string $pageG text to scan (page source or a list of links)
 * @return array whatever parseMail() returns for the matches
 */
function captureMails($pageG)
{
    $mailResults = array();

    // One dot-separated piece of the local part.
    $localAtom = '[a-z0-9!#$%&\'*+\/=?^_`{|}~-]+';
    $topLevel = '[a-z]{2}|com|org|net|gov|edu|mil|int|biz|info|name|aero|jobs|'
        . 'museum|coop|pro|travel|mobi';

    $regex_cm = '/(?:' . $localAtom
        . '(?:\.' . $localAtom . ')*@'
        . '(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+(?:' . $topLevel . ')\b)/i';

    // A failed download can leave the caller with a non-string page.
    preg_match_all($regex_cm, (string) $pageG, $mailResults);
    return parseMail($mailResults);
}

/**
 * Resolves a link found on the current page (global $lastlink) to an absolute
 * URL.
 *
 * - http:// and https:// links are returned unchanged.
 * - "//host/path" links get the scheme of the current page.
 * - "/path" links are appended to the scheme and host of the current page.
 * - Anything else is appended to the directory of the current page; a leading
 *   "./" is dropped, "../" segments are kept for the server or client to
 *   resolve. These links are also logged to the global $dumpAbsoluteLinks
 *   handle as "<prefix> <link>".
 *
 * @param string $link href value as found in the page
 * @return string absolute URL
 */
function transformToAbsoluteLink($link)
{
    global $lastlink;
    global $dumpAbsoluteLinks;

    $link = (string) $link;

    // Already absolute. The pattern is anchored so that a relative link that
    // merely contains a URL in its query string is still resolved.
    if (preg_match('/^https?:\/\//i', $link))
    {
        return $link;
    }

    // Ignore the query string and fragment of the current page: a "/" inside
    // them must not be mistaken for a path separator.
    $base = (string) $lastlink;
    $base = (string) substr($base, 0, strcspn($base, "?#"));

    // Position of the first "/" after "scheme://host", or false when the
    // current page URL has no path at all (e.g. "http://example.com").
    $schemeEnd = strpos($base, "://");
    $hostStart = ($schemeEnd === false) ? 0 : $schemeEnd + 3;
    $pathStart = strpos($base, "/", $hostStart);

    // Protocol-relative link: reuse the scheme of the current page.
    if (strncmp($link, "//", 2) === 0)
    {
        $scheme = (strncasecmp($base, "https://", 8) === 0) ? "https:" : "http:";
        return $scheme . $link;
    }

    // Root-relative link: scheme and host of the current page plus the link.
    if ($link !== "" && $link[0] == "/")
    {
        $origin = ($pathStart === false) ? $base : substr($base, 0, $pathStart);
        return $origin . $link;
    }

    // Document-relative link: resolve against the directory of the current page.
    if ($pathStart === false)
    {
        $prefix = $base . "/";
    }
    else
    {
        $prefix = substr($base, 0, strrpos($base, "/") + 1);
    }

    if (strncmp($link, "./", 2) === 0)
    {
        $link = (string) substr($link, 2);
    }

    if (is_resource($dumpAbsoluteLinks))
    {
        fprintf($dumpAbsoluteLinks, "%s %s\n", $prefix, $link);
    }

    return $prefix . $link;
}

/**
 * array_walk_recursive() callback: appends one item, preceded by a space, to
 * the global $line_links string.
 *
 * @param mixed $item value of the element being visited
 * @param mixed $key  key of the element (unused)
 */
function linearizelinks($item, $key)
{
    global $line_links;

    $line_links = $line_links . " " . $item;
}

// Main crawl loop. Each pass scans the page currently held in $page for mail
// addresses and links, queues the links that pass the filters, then fetches
// the next queued URL. The loop ends when the queue runs dry, after the last
// fetched page has been scanned as well.

// Link schemes and file extensions that are never followed.
$skipScheme = "/^(mailto:|javascript:|news:)/i";
$skipExtension = "/\.(jpg|gif|png|ico|jpeg|pdf|zip|rar|tar|gz|c|pl|py|js|reg|orig"
    . "|exe|java|class|css|mp3|wav|ra|pm|mov|avi|aac|wmv)$/i";

while (true)
{
    // Capture the links of the current page
    $link_results = array();
    preg_match_all("/href=\"(.*?)\"/", (string) $page, $link_results);

    // Mail addresses are searched in the page text and in the link targets
    // (each link preceded by a space).
    $line_links = empty($link_results[1]) ? "" : " " . implode(" ", $link_results[1]);
    captureMails($page);
    captureMails($line_links);

    foreach ( $link_results[1] as $link )
    {
        // Crude validation of the link: skip non-page schemes, known binary or
        // source file extensions, and anything without a dot in it.
        $validLink = !preg_match($skipScheme, $link)
            && !preg_match($skipExtension, $link)
            && strpos($link, ".") !== false;

        // $siteNumber follows the size of the queue; -m/-max caps it.
        $hasRoom = ($maximumLinks == 0) || ($siteNumber < $maximumLinks);

        if (!$validLink || !$hasRoom)
        {
            continue;
        }

        $link = transformToAbsoluteLink($link);

        // -b/-baseurl: the link must contain the given string. This applies to
        // both the database and the in-memory bookkeeping below.
        if ($baseOnly == true && strpos($link, $baseUrl) === false)
        {
            continue;
        }

        if (!empty($mysql_server))
        {
            // The UNIQUE index makes the INSERT fail for a link that is
            // already stored, so any error is read as "already visited".
            mysql_query("INSERT INTO $mysql_tablename (link) values('".mysql_real_escape_string("$link")."')");
            $isNewLink = !mysql_error();
        }
        else
        {
            $isNewLink = !in_array($link, $visitedSites, true);
            if ($isNewLink)
            {
                array_push($visitedSites, $link);
            }
        }

        if (!$isNewLink)
        {
            continue;
        }

        array_push($siteQueue, $link);
        $siteNumber++;
        fprintf($dumpLinks, "%s\n", $link);
        $total_links++;
        if ($total_links % 100 == 0) echo "visited links = $total_links\n";
    }

    // Take the next URL from the queue. Empty entries (such as the placeholder
    // the queue starts with) are dropped instead of being fetched.
    $pg = null;
    while (count($siteQueue) > 0)
    {
        $candidate = array_shift ( $siteQueue );
        $siteNumber--;
        if ($candidate !== null && $candidate !== "")
        {
            $pg = $candidate;
            break;
        }
    }
    if ($pg === null)
    {
        break;
    }

    // Fetch the page
    $lastlink = $pg;
    echo "$pg\n";
    $curlPage = initCurlVariable($pg);
    $page = curl_exec( $curlPage );
    curl_close($curlPage);
    if ($page === false)
    {
        // Failed download: continue with an empty page.
        $page = "";
    }

    // A page identical to the previous one is not scanned a second time.
    if (strcmp($page, $pageTemp) == 0)
    {
        $page = "";
        $pageTemp = "";
    }
    else
    {
        $pageTemp = $page;
    }
}

$mails = array_unique($mails);
print_r($mails);

// Flush and release the dump files.
foreach (array($dumpMails, $dumpLinks, $dumpAbsoluteLinks) as $dumpHandle)
{
    if (is_resource($dumpHandle))
    {
        fclose($dumpHandle);
    }
}

?>

Posted

sysghost, in betzia lui, nu se refera la complexitatea codului sursa, ci la inutilitatea acestuia... Nu se merita pierderea de timp pentru un astfel de tool cand sunt altele deja facute; e ca si cum ai reinventa apa calda si branza de vaca...

Posted

intr-adevar, a fost distractiv sa-l programez.

am trecut prin cateva chestii php pe care nu le stiam.

si eu atunci cand m-am apucat stiam doar de muler email si nici nu am mai cautat ceva asemanator ^_^

Join the conversation

You can post now and register later. If you have an account, sign in now to post with your account.

Guest
Reply to this topic...

×   Pasted as rich text.   Paste as plain text instead

  Only 75 emoji are allowed.

×   Your link has been automatically embedded.   Display as a link instead

×   Your previous content has been restored.   Clear editor

×   You cannot paste images directly. Upload or insert images from URL.



×
×
  • Create New...