moubik Posted September 29, 2007 Report Posted September 29, 2007 am gasit la un moment dat pe acest site muler mail crawlersi nu mi-a placut foarte multmi-am scris eu acest programelhttp://www.wikiupload.com/download_page.php?id=211708
<?php
/**
 * Hawler - a command-line e-mail crawler.
 *
 * Starting from one URL, the script downloads pages with cURL, collects
 * everything that looks like an e-mail address, and follows the links it
 * finds. Results are appended to three files in the working directory:
 *   mails.txt      - every distinct address found during this run
 *   links.txt      - every link that was queued for crawling
 *   absolution.txt - "prefix link" pairs logged while resolving relative links
 *
 * Visited links are remembered in memory, or in a MySQL table when
 * -sqlserver is given. Run with -h for the list of options.
 */

// ---------------------------------------------------------------------------
// Configuration (each value can be overridden from the command line)
// ---------------------------------------------------------------------------
$startLink       = "";
$maximumLinks    = 0; // 0 for infinite
$userAgent       = 1;
$baseOnly        = false;
$baseUrl         = "";
$mysql_tablename = "hawler_visitedlinks";
$mysql_server    = "";
$mysql_user      = "";
$mysql_pass      = "";
$mysql_database  = "";

// Flags that expect a value, mapped to the global variable they fill.
$valueFlags = array(
    "-s"         => "startLink",
    "-start"     => "startLink",
    "-m"         => "maximumLinks",
    "-max"       => "maximumLinks",
    "-u"         => "userAgent",
    "-useragent" => "userAgent",
    "-b"         => "baseUrl",
    "-baseurl"   => "baseUrl",
    "-sqlserver" => "mysql_server",
    "-dbserver"  => "mysql_server",
    "-sqluser"   => "mysql_user",
    "-dbuser"    => "mysql_user",
    "-sqlpass"   => "mysql_pass",
    "-dbpass"    => "mysql_pass",
    "-sqldb"     => "mysql_database",
    "-dbdb"      => "mysql_database",
);

// Process parameters. $_par holds the name of the variable that the next
// argument must be stored in ("" when no value is pending).
$_par = "";
foreach (array_slice($argv, 1) as $param) {
    if ($_par !== "") {
        ${$_par} = $param;
        $_par = "";
        // A consumed value must not be re-interpreted as a flag.
        continue;
    }
    if (in_array($param, array("-h", "-help", "--help", "-?", "/?"), true)) {
        displayHelp();
    }
    if (isset($valueFlags[$param])) {
        $_par = $valueFlags[$param];
        if ($_par === "baseUrl") {
            $baseOnly = true;
        }
    }
}
$maximumLinks = (int) $maximumLinks;
$userAgent    = (int) $userAgent;

// Without a start link there is nothing to crawl.
if ($startLink === "") {
    displayHelp();
}

// ---------------------------------------------------------------------------
// Optional MySQL storage for visited links
// ---------------------------------------------------------------------------
$mysql      = null; // mysqli connection, or null when no database is used
$insertLink = null; // prepared INSERT statement, or null when no database is used
if (!empty($mysql_server)) {
    // Report errors through return values: a failed INSERT is how a
    // duplicate (already visited) link is detected further down.
    mysqli_report(MYSQLI_REPORT_OFF);

    $mysql = mysqli_connect($mysql_server, $mysql_user, $mysql_pass, $mysql_database);
    if (!$mysql) {
        die("Error connecting:" . mysqli_connect_error());
    }
    echo "Connected to mysql database\n";

    // Creating table if it does not exist.
    echo "Creating table $mysql_tablename\n";
    if (!mysqli_query($mysql, "CREATE TABLE IF NOT EXISTS $mysql_tablename (link TEXT(500), UNIQUE (link(500)))")) {
        die("Error creating table:" . mysqli_error($mysql));
    }

    $insertLink = mysqli_prepare($mysql, "INSERT INTO $mysql_tablename (link) VALUES (?)");
    if (!$insertLink) {
        die("Error preparing statement:" . mysqli_error($mysql));
    }
}

// Normalise the start link: add a scheme and a trailing slash when missing.
if (!preg_match('#^https?://#i', $startLink)) {
    $startLink = "http://" . $startLink;
}
if (substr_count($startLink, "/") <= 2) {
    $startLink = $startLink . "/";
}

echo "Startlink '$startLink'\n";
echo "User-Agent '$userAgent'\n";
echo "Maximum links '$maximumLinks'\n";

/**
 * Prints the usage text and terminates the script.
 */
function displayHelp()
{
    echo <<<'HELP'
Hawler, mail crawler
By Moubik
 -h : help
 -s, -start [url] : start link
 -m, -max [number] : maximum number of links to hold in memory, 0 for infinite
 -u, -useragent [1-3] : 1=Mozilla, 2=Googlebot, 3=Internet Explorer
 -b, -baseurl [url] : string every link must contain
======================================================================================
If you use a database for saving the links you visited
 -sqlserver [ip/dns] : sqlserver name
 -sqluser [user] : user to login with
 -sqlpass [pass] : pass to login with
 -sqldb [databasename] : database to save to
2007

HELP;
    exit();
}

/**
 * Creates a cURL handle for $url that returns the body as a string,
 * follows redirects and sends the User-Agent selected by the global
 * $userAgent (1=Mozilla, 2=Googlebot, 3=Internet Explorer, as documented
 * in the help text).
 *
 * @param string $url address to download
 * @return resource|object|false the cURL handle, or false if it could not be created
 */
function initCurlVariable($url)
{
    global $userAgent;

    $agents = array(
        1 => "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.7) Gecko/20070914 Firefox/2.0.0.7",
        2 => "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
        3 => "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30)",
    );

    $auxCurl = curl_init($url);
    if ($auxCurl === false) {
        return false;
    }
    curl_setopt($auxCurl, CURLOPT_HEADER, 0);
    curl_setopt($auxCurl, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($auxCurl, CURLOPT_FOLLOWLOCATION, 1);
    // One unresponsive host must not stall the whole crawl.
    curl_setopt($auxCurl, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($auxCurl, CURLOPT_TIMEOUT, 30);
    if (isset($agents[$userAgent])) {
        curl_setopt($auxCurl, CURLOPT_USERAGENT, $agents[$userAgent]);
    }
    return $auxCurl;
}

/**
 * Cleans the addresses in a (possibly nested) preg_match_all() result,
 * appends each address not seen before in this run to the global $mails
 * list and writes it to the mails.txt handle.
 *
 * @param array $mailArray strings or nested arrays of strings
 * @return array the global list of addresses collected so far
 */
function parseMail($mailArray)
{
    global $mails;
    global $dumpMails;
    static $seen = array(); // address => true, for constant-time duplicate checks

    foreach ($mailArray as $mail) {
        if (is_array($mail)) {
            parseMail($mail);
            continue;
        }
        // Strip the Yahoo "send message" link prefix and URL leftovers that
        // the address pattern accepts as part of the local part.
        $mail = str_replace("//edit.yahoo.com/config/send_webmesg?.target=", "", $mail);
        $mail = str_replace("/", "", $mail);
        $mail = str_replace("%20", "", $mail);

        if ($mail === "" || isset($seen[$mail])) {
            continue;
        }
        $seen[$mail] = true;
        array_push($mails, $mail);
        fprintf($dumpMails, "%s\n", $mail);
    }
    return $mails;
}

/**
 * Finds everything in $pageG that looks like an e-mail address and hands
 * the matches to parseMail().
 *
 * @param string $pageG text to scan
 * @return array the global list of addresses collected so far
 */
function captureMails($pageG)
{
    global $mails;

    $regex_cm = '/(?:[a-z0-9!#$%&\'*+\/=?^_`{|}~-]+'
        . '(?:\.[a-z0-9!#$%&\'*+\/=?^_`{|}~-]+)*@'
        . '(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+(?:[a-z]{2}|'
        . 'com|org|net|gov|biz|info|name|aero|jobs|'
        . 'museum)\b)/i';

    $mailResults = array();
    if (!preg_match_all($regex_cm, $pageG, $mailResults)) {
        return $mails;
    }
    return parseMail($mailResults);
}

/**
 * Resolves $link against the page it was found on (global $lastlink).
 *
 * Absolute http/https links are returned unchanged. Protocol-relative
 * ("//host/x"), root-relative ("/x") and document-relative ("x", "./x",
 * "../x") links are turned into absolute URLs. Each document-relative link
 * is also logged to the absolution.txt handle as "prefix link".
 *
 * @param string $link href value as found in the page
 * @return string absolute URL, or $link unchanged when it cannot be resolved
 */
function transformToAbsoluteLink($link)
{
    global $lastlink;
    global $dumpAbsoluteLinks;

    if ($link === "" || preg_match('#^https?://#i', $link)) {
        return $link;
    }

    $base = parse_url($lastlink);
    if (!is_array($base) || empty($base['host'])) {
        return $link;
    }
    $scheme = isset($base['scheme']) ? $base['scheme'] : "http";
    $root = $scheme . "://" . $base['host'];
    if (isset($base['port'])) {
        $root .= ":" . $base['port'];
    }

    if (strncmp($link, "//", 2) == 0) {
        return $scheme . ":" . $link;
    }
    if ($link[0] == "/") {
        return $root . $link;
    }

    $path = isset($base['path']) ? $base['path'] : "/";

    // Split "path?query#fragment" so only the path part gets normalised.
    $cut = strcspn($link, "?#");
    if ($cut === 0) {
        // Query-only link: it refers to the current document.
        return $root . $path . $link;
    }

    $slash = strrpos($path, "/");
    $dir = ($slash === false) ? "/" : substr($path, 0, $slash + 1);
    fprintf($dumpAbsoluteLinks, "%s %s\n", $root . $dir, $link);

    // Collapse "." and ".." segments; the leading empty segment stands for
    // the root and is never popped.
    $parts = explode("/", $dir . substr($link, 0, $cut));
    $segments = array();
    foreach ($parts as $part) {
        if ($part === "..") {
            if (count($segments) > 1) {
                array_pop($segments);
            }
        } elseif ($part !== ".") {
            $segments[] = $part;
        }
    }
    $last = $parts[count($parts) - 1];
    if ($last === "." || $last === "..") {
        $segments[] = ""; // a trailing dot segment names a directory
    }

    return $root . implode("/", $segments) . substr($link, $cut);
}

/**
 * array_walk_recursive() callback: appends $item to the global
 * $line_links string, separated by a space.
 *
 * @param string $item current href value
 * @param mixed  $key  array key (unused)
 */
function linearizelinks($item, $key)
{
    global $line_links;
    $line_links .= " $item";
}

/**
 * Cheap filter for links worth downloading: rejects mailto/javascript/news
 * links, links ending in a known non-HTML extension, and links that do not
 * contain a dot at all.
 *
 * @param string $link href value (fragment already removed)
 * @return bool true when the link should be considered for crawling
 */
function isCrawlableLink($link)
{
    if (preg_match("/^(mailto:|javascript:|news:)/i", $link)) {
        return false;
    }
    $skipped = "jpg|gif|png|ico|jpeg|pdf|zip|rar|tar|gz|c|pl|py|js|reg|orig"
        . "|exe|java|class|css|mp3|wav|ra|pm|mov|avi|aac|wmv";
    if (preg_match("/\.(" . $skipped . ")$/i", $link)) {
        return false;
    }
    return (bool) preg_match("/\./", $link);
}

/**
 * Records $link as visited.
 *
 * With a database, the link is inserted into the table and the UNIQUE
 * index rejects links that were stored before; any failed INSERT is
 * treated as "already visited". Without a database an in-memory set is used.
 *
 * @param string $link absolute URL
 * @return bool true when the link had not been recorded before
 */
function rememberLink($link)
{
    global $insertLink;
    global $visitedSites;

    if ($insertLink) {
        mysqli_stmt_bind_param($insertLink, "s", $link);
        return mysqli_stmt_execute($insertLink);
    }
    if (isset($visitedSites[$link])) {
        return false;
    }
    $visitedSites[$link] = true;
    return true;
}

// ---------------------------------------------------------------------------
// Crawl
// ---------------------------------------------------------------------------
$lastlink     = $startLink; // page currently being processed; base for relative links
$pg           = $startLink;
$line_links   = "";
$siteQueue    = array($startLink);
$visitedSites = array();    // url => true (used when no database is configured)
$total_links  = 0;
$pageTemp     = "";         // body of the previously fetched page
$mails        = array();

$dumpMails         = fopen("mails.txt", "a");
$dumpLinks         = fopen("links.txt", "a");
$dumpAbsoluteLinks = fopen("absolution.txt", "a");
if (!$dumpMails || !$dumpLinks || !$dumpAbsoluteLinks) {
    die("Error opening output files\n");
}

// The start link may already be stored from an earlier run; crawl it anyway.
rememberLink($startLink);

while (count($siteQueue) > 0) {
    // Fetch the next page.
    $pg = array_shift($siteQueue);
    $lastlink = $pg;
    echo "$pg\n";

    $curlPage = initCurlVariable($pg);
    $page = ($curlPage === false) ? false : curl_exec($curlPage);
    if ($page === false) {
        $reason = ($curlPage === false) ? "" : ": " . curl_error($curlPage);
        fwrite(STDERR, "Could not fetch '$pg'" . $reason . "\n");
        $page = "";
    }
    if ($curlPage !== false) {
        curl_close($curlPage);
    }

    // Skip a page whose body is identical to the previous one.
    if ($page === $pageTemp) {
        $pageTemp = "";
        continue;
    }
    $pageTemp = $page;

    // Capture the links.
    $link_results = array();
    $hrefs = array();
    if (preg_match_all('/href\s*=\s*(["\'])(.*?)\1/is', $page, $link_results)) {
        $hrefs = $link_results[2];
    }
    $line_links = "";
    array_walk_recursive($hrefs, 'linearizelinks');

    captureMails($page);
    captureMails($line_links);

    foreach ($hrefs as $link) {
        // A fragment only addresses a position inside the same document.
        $link = trim($link);
        $hashPos = strpos($link, "#");
        if ($hashPos !== false) {
            $link = substr($link, 0, $hashPos);
        }

        if (!isCrawlableLink($link)) {
            continue;
        }
        // -m limits the number of links waiting in the queue.
        if ($maximumLinks != 0 && count($siteQueue) >= $maximumLinks) {
            continue;
        }

        $link = transformToAbsoluteLink($link);

        // -b: every link must contain the given string.
        if ($baseOnly == true && ($baseUrl === "" || strpos($link, $baseUrl) === false)) {
            continue;
        }
        if (!rememberLink($link)) {
            continue;
        }

        array_push($siteQueue, $link);
        fprintf($dumpLinks, "%s\n", $link);
        $total_links++;
        if ($total_links % 100 == 0) {
            echo "visited links = $total_links\n";
        }
    }
}

$mails = array_unique($mails);
print_r($mails);

fclose($dumpMails);
fclose($dumpLinks);
fclose($dumpAbsoluteLinks);
if ($mysql) {
    mysqli_close($mysql);
}
?> Quote
moubik Posted September 29, 2007 Author Report Posted September 29, 2007 ma asteptam sa comentezi. spune-mi cum sa simplific/optimizez codul daca ai idei. sunt noob in php. accept idei. Quote
escalation666 Posted September 30, 2007 Report Posted September 30, 2007 interesant mrclawner tau, but you are basic...you're wasted Quote
Guest BanKai Posted September 30, 2007 Report Posted September 30, 2007 sysghost in betzia lui nu se refera la complexitatea codului sursa ci la inutilitatea acestuia ... nu se merita pierderea de timp pentru un astfel de tool cand sunt altele deja facute e ca si cum ai reinventa apa calda si branza de vaca ... Quote
moubik Posted September 30, 2007 Author Report Posted September 30, 2007 intradevar a fost distractiv sa-l programez. am trecut prin cateva chestii php pe care nu le stiam. si eu atunci cand m-am apucat stiam doar de muler email si nici nu am mai cautat ceva asemanator Quote