<?php
/**
 * Hawler - a command-line e-mail address crawler.
 *
 * Author's note (translated from the original forum post): "At some point I
 * found the 'muler' mail crawler on this site and did not like it very much,
 * so I wrote this little program myself."
 * Original download: http://www.wikiupload.com/download_page.php?id=211708
 *
 * Starting from the start link, every fetched page is scanned for e-mail
 * addresses and for href="..." links. Output files (opened in append mode in
 * the current directory):
 *   mails.txt      - every e-mail address found (duplicates included)
 *   links.txt      - every link that was queued for crawling
 *   absolution.txt - "<directory> <relative link>" pairs that were resolved
 *
 * When a MySQL server is given, queued links are stored in the table
 * hawler_visitedlinks and its UNIQUE index is used to detect links that were
 * already seen; otherwise an in-memory set is used.
 *
 * Run "php hawler.php -h" for the list of options.
 */

// ---------------------------------------------------------------------------
// Defaults, overridable from the command line
// ---------------------------------------------------------------------------
$_par = "";             // name of the variable that receives the next argument
$startLink = "";
$maximumLinks = 0;      // 0 for infinite
$userAgent = 1;
$baseOnly = false;
$baseUrl = "";

$mysql_tablename = "hawler_visitedlinks";
$mysql_server = "";
$mysql_user = "";
$mysql_pass = "";
$mysql_database = "";

$mysql = null;          // mysqli connection, only set when -sqlserver is given
$insertLink = null;     // prepared INSERT statement for queued links
$linkToInsert = "";     // variable bound to the prepared statement

// ---------------------------------------------------------------------------
// Process parameters
// ---------------------------------------------------------------------------
// Maps every option that takes a value to the global variable it fills.
$optionTargets = array(
    "-s"         => "startLink",
    "-start"     => "startLink",
    "-m"         => "maximumLinks",
    "-max"       => "maximumLinks",
    "-u"         => "userAgent",
    "-useragent" => "userAgent",
    "-b"         => "baseUrl",
    "-baseurl"   => "baseUrl",
    "-sqlserver" => "mysql_server",
    "-dbserver"  => "mysql_server",
    "-sqluser"   => "mysql_user",
    "-dbuser"    => "mysql_user",
    "-sqlpass"   => "mysql_pass",
    "-dbpass"    => "mysql_pass",
    "-sqldb"     => "mysql_database",
    "-dbdb"      => "mysql_database",
);

// $argv[0] is the script name, not an option.
$arguments = isset($argv) ? array_slice($argv, 1) : array();
foreach ($arguments as $param) {
    if ($_par !== "") {
        // This argument is the value of the previous option. Skip the flag
        // checks so a value that looks like an option is not re-interpreted.
        $$_par = $param;
        $_par = "";
        continue;
    }

    if (in_array($param, array("-h", "-help", "--help", "-?", "/?"), true)) {
        displayHelp();
    }

    if (isset($optionTargets[$param])) {
        $_par = $optionTargets[$param];
        if ($_par === "baseUrl") {
            $baseOnly = true;
        }
    }
}

// Command-line values arrive as strings; the crawler compares them as numbers.
$maximumLinks = (int) $maximumLinks;
$userAgent = (int) $userAgent;

if ($startLink === "") {
    echo "No start link given (use -s [url])\n\n";
    displayHelp();
}

// Accept both http:// and https:// start links; default to http://.
if (!preg_match('#^https?://#i', $startLink)) {
    $startLink = "http://" . $startLink;
}
// "http://host" has only the two slashes of the scheme: add the root path.
if (substr_count($startLink, "/") <= 2) {
    $startLink = $startLink . "/";
}

// ---------------------------------------------------------------------------
// Optional database used to remember visited links
// ---------------------------------------------------------------------------
$useDatabase = ($mysql_server !== "");
if ($useDatabase) {
    if (!function_exists("mysqli_connect")) {
        die("Error connecting: the mysqli extension is not available\n");
    }

    // Failures must be reported through return values: a failed INSERT
    // (duplicate key) is how an already-visited link is recognised below.
    mysqli_report(MYSQLI_REPORT_OFF);

    $mysql = mysqli_connect($mysql_server, $mysql_user, $mysql_pass);
    if (!$mysql) {
        die("Error connecting:" . mysqli_connect_error());
    }
    echo "Connected to mysql database\n";

    if (!mysqli_select_db($mysql, $mysql_database)) {
        die("Error selecting database:" . mysqli_error($mysql) . "\n");
    }

    // Creating table if it does not exist
    echo "Creating table $mysql_tablename\n";
    $createTable = "CREATE TABLE IF NOT EXISTS $mysql_tablename (link TEXT(500), UNIQUE (link(500)))";
    if (!mysqli_query($mysql, $createTable)) {
        die("Error creating table:" . mysqli_error($mysql) . "\n");
    }

    $insertLink = mysqli_prepare($mysql, "INSERT INTO $mysql_tablename (link) VALUES (?)");
    if (!$insertLink) {
        die("Error preparing insert:" . mysqli_error($mysql) . "\n");
    }
    // Bound by reference: assigning $linkToInsert changes the inserted value.
    mysqli_stmt_bind_param($insertLink, "s", $linkToInsert);
}

echo "Startlink '$startLink'\n";
echo "User-Agent '$userAgent'\n";
echo "Maximum links '$maximumLinks'\n";

// ---------------------------------------------------------------------------
// Functions
// ---------------------------------------------------------------------------

/**
 * Prints the usage text and terminates the script.
 */
function displayHelp()
{
    echo <<<'HELP'
Hawler, mail crawler
By Moubik

-h : help
-s, -start [url] : start link
-m, -max [number] : maximum number of links to hold in memory, 0 for infinite
-u, -useragent [1-3] : 1=Mozilla, 2=Googlebot, 3=Internet Explorer
-b, -baseurl [url] : string every link must contain
======================================================================================
If you use a database for saving the links you visited
-sqlserver [ip/dns] : sqlserver name
-sqluser [user] : user to login with
-sqlpass [pass] : pass to login with
-sqldb [databasename] : database to save to

2007

HELP;
    exit();
}

/**
 * Creates a cURL handle for $url that returns the body as a string, follows
 * redirects and sends the User-Agent selected by the global $userAgent
 * (1=Mozilla, 2=Googlebot, 3=Internet Explorer, as documented in the help).
 * Any other value leaves cURL's default (no User-Agent option set).
 *
 * @param string $url address to fetch
 * @return resource|object|false the cURL handle, or false if curl_init failed
 */
function initCurlVariable($url)
{
    global $userAgent;

    $auxCurl = curl_init($url);
    if ($auxCurl === false) {
        return false;
    }

    curl_setopt($auxCurl, CURLOPT_HEADER, 0);
    curl_setopt($auxCurl, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($auxCurl, CURLOPT_FOLLOWLOCATION, 1);
    // Without limits a single dead host or redirect loop stalls the crawl.
    curl_setopt($auxCurl, CURLOPT_MAXREDIRS, 10);
    curl_setopt($auxCurl, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($auxCurl, CURLOPT_TIMEOUT, 60);

    switch ((int) $userAgent) {
        case 1:
            curl_setopt($auxCurl, CURLOPT_USERAGENT, "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.7) Gecko/20070914 Firefox/2.0.0.7");
            break;
        case 2:
            curl_setopt($auxCurl, CURLOPT_USERAGENT, "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)");
            break;
        case 3:
            curl_setopt($auxCurl, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30)");
            break;
    }

    return $auxCurl;
}

/**
 * Downloads $url and releases the cURL handle.
 *
 * @param string $url address to fetch
 * @return string the response body, or "" when the transfer failed
 */
function fetchPage($url)
{
    $handle = initCurlVariable($url);
    if ($handle === false) {
        return "";
    }

    $body = curl_exec($handle);

    // Since PHP 8.0 handles are objects freed automatically and curl_close()
    // has no effect; older versions need the explicit close.
    if (PHP_VERSION_ID < 80000) {
        curl_close($handle);
    }

    return is_string($body) ? $body : "";
}

/**
 * Cleans every address in a (possibly nested) preg_match_all result, appends
 * it to the global $mails list and writes it to mails.txt.
 *
 * The e-mail pattern allows "/", "?" and "=" in the local part, so a Yahoo
 * "send message" link is matched together with its URL prefix; that prefix,
 * stray slashes and encoded spaces are removed here.
 *
 * @param array $mailArray strings or nested arrays of strings
 * @return array the global list of all addresses collected so far
 */
function parseMail($mailArray)
{
    global $mails;
    global $dumpMails;

    $noise = array("//edit.yahoo.com/config/send_webmesg?.target=", "/", "%20");

    foreach ($mailArray as $mail) {
        if (is_array($mail)) {
            parseMail($mail);
            continue;
        }

        $mail = str_replace($noise, "", $mail);
        $mails[] = $mail;
        if (is_resource($dumpMails)) {
            fprintf($dumpMails, "%s\n", $mail);
        }
    }

    return $mails;
}

/**
 * Finds all e-mail addresses in $pageG and records them through parseMail().
 *
 * @param string $pageG text to scan
 * @return array the global list of all addresses collected so far
 */
function captureMails($pageG)
{
    $mailResults = array();
    $regex_cm = '/(?:[a-z0-9!#$%&\'*+\/=?^_`{|}~-]+'
        . '(?:\.[a-z0-9!#$%&\'*+\/=?^_`{|}~-]+)*@'
        . '(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+(?:[a-z]{2}|'
        . 'com|org|net|gov|biz|info|name|aero|biz|info|jobs|'
        . 'museum)\b)/i';

    preg_match_all($regex_cm, (string) $pageG, $mailResults);

    return parseMail($mailResults);
}

/**
 * Resolves a link found on the page currently being parsed (global $lastlink)
 * into an absolute URL.
 *
 * - "http://..." and "https://..." links are returned unchanged.
 * - "//host/path" links get the scheme of the current page.
 * - "/path" links are resolved against the site root of the current page.
 * - anything else is resolved against the directory of the current page and
 *   logged to absolution.txt. A leading "./" is removed; "../" segments are
 *   left in place rather than rewritten.
 *
 * @param string $link href value as found in the page
 * @return string absolute URL
 */
function transformToAbsoluteLink($link)
{
    global $lastlink;
    global $dumpAbsoluteLinks;

    if (preg_match('#^https?://#i', $link)) {
        return $link;
    }

    // Query string and fragment of the current page are not part of its path.
    $base = (string) preg_replace('/[?#].*$/s', '', (string) $lastlink);

    $schemeEnd = strpos($base, "://");
    $hostStart = ($schemeEnd === false) ? 0 : $schemeEnd + 3;
    $pathStart = strpos($base, "/", $hostStart);

    // "http://host/" - when the current URL has no path, append the slash.
    $siteRoot = ($pathStart === false) ? $base . "/" : substr($base, 0, $pathStart + 1);

    if (strncmp($link, "//", 2) === 0) {
        $scheme = ($schemeEnd === false) ? "http" : substr($base, 0, $schemeEnd);
        return $scheme . ":" . $link;
    }

    if (strncmp($link, "/", 1) === 0) {
        return $siteRoot . substr($link, 1);
    }

    // Everything up to and including the last slash of the current page.
    $directory = ($pathStart === false)
        ? $siteRoot
        : substr($base, 0, strrpos($base, "/") + 1);

    while (strncmp($link, "./", 2) === 0) {
        $link = (string) substr($link, 2);
    }

    if (is_resource($dumpAbsoluteLinks)) {
        fprintf($dumpAbsoluteLinks, "%s %s\n", $directory, $link);
    }

    return $directory . $link;
}

/**
 * array_walk_recursive() callback: appends $item, preceded by a space, to the
 * global $line_links string.
 *
 * @param string $item link text
 * @param mixed  $key  unused, required by the callback signature
 */
function linearizelinks($item, $key)
{
    global $line_links;
    $line_links .= " $item";
}

/**
 * Simple validation of a raw href value: rejects mailto:/javascript:/news:
 * links, links ending in a known non-HTML file extension, and links that
 * contain no dot at all.
 *
 * @param string $link href value as found in the page
 * @return bool true when the link should be considered for crawling
 */
function isCrawlableLink($link)
{
    if (preg_match("/^(mailto:|javascript:|news:)/i", $link)) {
        return false;
    }

    $skippedExtensions = "jpg|gif|png|ico|jpeg|pdf"
        . "|zip|rar|tar|gz"
        . "|c|pl|py|js|reg|orig"
        . "|exe|java|class"
        . "|css"
        . "|mp3|wav|ra|pm|mov|avi|aac|wmv";
    if (preg_match("/\.(" . $skippedExtensions . ")$/i", $link)) {
        return false;
    }

    return strpos($link, ".") !== false;
}

// ---------------------------------------------------------------------------
// Crawl
// ---------------------------------------------------------------------------
$lastlink = $startLink;     // page being parsed; read by transformToAbsoluteLink()
$pg = $startLink;
$page = "";
$line_links = "";
$siteQueue = array($startLink);
$visitedSites = array($startLink => true);  // set of links, keyed by URL
$siteNumber = 1;            // number of links currently held in $siteQueue
$total_links = 0;
$pageTemp = "";             // body of the previously fetched page
$mails = array();

$dumpMails = fopen("mails.txt", "a");
$dumpLinks = fopen("links.txt", "a");
$dumpAbsoluteLinks = fopen("absolution.txt", "a");
if (!$dumpMails || !$dumpLinks || !$dumpAbsoluteLinks) {
    die("Error opening output files in the current directory\n");
}

while (count($siteQueue) > 0) {
    // Fetch the page
    $pg = array_shift($siteQueue);
    $lastlink = $pg;
    $siteNumber--;
    echo "$pg\n";

    $page = fetchPage($pg);

    // A body identical to the previous one is not parsed again.
    if (strcmp($page, $pageTemp) == 0) {
        $page = "";
        $pageTemp = "";
    } else {
        $pageTemp = $page;
    }

    // Capture the links
    $link_results = array();
    preg_match_all("/href=\"(.*?)\"/i", $page, $link_results);
    $hrefs = isset($link_results[1]) ? $link_results[1] : array();

    $line_links = "";
    array_walk_recursive($hrefs, 'linearizelinks');

    captureMails($page);
    captureMails($line_links);

    foreach ($hrefs as $link) {
        // The queue only grows inside this loop, so once it is full no
        // further link of this page can be accepted.
        if ($maximumLinks != 0 && $siteNumber >= $maximumLinks) {
            break;
        }

        if (!isCrawlableLink($link)) {
            continue;
        }

        $link = transformToAbsoluteLink($link);

        if ($baseOnly && $baseUrl !== "" && strpos($link, $baseUrl) === false) {
            continue;
        }

        if ($useDatabase) {
            // The UNIQUE index rejects links that are already stored.
            $linkToInsert = $link;
            $isNew = mysqli_stmt_execute($insertLink);
        } else {
            $isNew = !isset($visitedSites[$link]);
            $visitedSites[$link] = true;
        }

        if (!$isNew) {
            continue;
        }

        $siteQueue[] = $link;
        $siteNumber++;
        fprintf($dumpLinks, "%s\n", $link);

        $total_links++;
        if ($total_links % 100 == 0) {
            echo "visited links = $total_links\n";
        }
    }
}

$mails = array_unique($mails);
print_r($mails);

fclose($dumpMails);
fclose($dumpLinks);
fclose($dumpAbsoluteLinks);
if ($useDatabase) {
    mysqli_stmt_close($insertLink);
    mysqli_close($mysql);
}