Hallo,
ich habe einen Webcrawler geschrieben, leider ist der Crawler total langsam. Ich denke, es liegt an den MySQL-Queries. Habt ihr eine Idee, wie ich den Crawler verbessern kann?
Danke im Voraus.
ich habe einen Webcrawler geschrieben, leider ist der Crawler total langsam. Ich denke, es liegt an den MySQL-Queries. Habt ihr eine Idee, wie ich den Crawler verbessern kann?
Danke im Voraus.
PHP-Code:
<?php
// NOTE(review): every PHP diagnostic is silenced here, so failures further
// down are invisible unless they are checked explicitly.
error_reporting(0);
/*
 * The **** Crawler
 * Edit: 17.07.2011
 * Version: 0.1.6 Beta
 */
// Connect to the database. Abort right away when that fails: with error
// reporting switched off the crawler would otherwise keep running against a
// dead connection without any visible hint.
$db = mysql_connect('localhost', '****', '*****');
if (!$db) {
    echo 'Could not connect to the database: ' . mysql_error() . "\n";
    exit(1);
}
if (!mysql_select_db('*****', $db)) {
    echo 'Could not select the database: ' . mysql_error($db) . "\n";
    exit(1);
}
// Set the User-Agent sent by PHP's HTTP stream wrapper
// (file_get_contents() / fopen() on http:// URLs).
ini_set("user_agent", "*****");
// Main crawl loop: take the oldest queued URL from `temp`, download it, queue
// the links found on the page and index the page text keyword by keyword.

/**
 * Tell whether $table contains at least one row whose $column equals $value.
 *
 * $table and $column are interpolated as identifiers and must be trusted
 * constants; only $value is escaped.
 *
 * @param string $table  Table name (without backticks).
 * @param string $column Column name (without backticks).
 * @param string $value  Value to look up.
 * @return bool True when a matching row exists, false otherwise or when the
 *              query fails.
 */
function crawlerRowExists($table, $column, $value)
{
    $result = mysql_query(
        'SELECT `id` FROM `' . $table . '` WHERE `' . $column . '` = "'
        . mysql_real_escape_string($value) . '" LIMIT 1'
    );
    return $result && mysql_num_rows($result) > 0;
}

while (true) {
    // Fetch the oldest queue entry.
    $queueResult = mysql_query('SELECT * FROM `temp` ORDER BY `id` LIMIT 1');
    $array = $queueResult ? mysql_fetch_array($queueResult) : false;
    if (!$array) {
        // Queue is empty (or the query failed): stop instead of busy-looping
        // forever on an empty URL.
        echo 'Queue is empty, crawler stopped.' . "\n";
        break;
    }
    $url = $array['url'];
    // Remove the entry itself and every other queue row with the same URL.
    // Plain equality is used because LIKE would treat % and _ inside the URL
    // as wildcards and delete unrelated rows.
    mysql_query("DELETE FROM `temp` WHERE id = '" . mysql_real_escape_string($array['id']) . "'");
    mysql_query("DELETE FROM `temp` WHERE url = '" . mysql_real_escape_string($url) . "'");
    echo $url . '
';
    // Base URL (scheme://host) used to turn host-less links into absolute ones.
    $parse_url = @parse_url($url);
    $scheme = isset($parse_url['scheme']) ? $parse_url['scheme'] : '';
    $host = isset($parse_url['host']) ? $parse_url['host'] : '';
    $base_url = $scheme . '://' . $host;

    // Download the page.
    // NOTE(review): the page is fetched before robots.txt is consulted; the
    // order is kept as it was, confirm whether that is intended.
    $content = @file_get_contents($url);
    if (!$content) {
        echo 'The Link not exist!';
        continue;
    }
    if (robotstxt($url) != false) {
        // robots.txt disallows this URL.
        continue;
    }

    $code = mb_detect_encoding($content, 'auto');
    // NOTE(review): mb_detect_encoding() yields names such as "UTF-8", which
    // are not necessarily valid MySQL charset names - verify this call works.
    mysql_set_charset($code);

    // ---- Links ------------------------------------------------------------
    $linkRows = array();      // value tuples for INSERT INTO `temp`
    $backlinkRows = array();  // value tuples for INSERT INTO `backlinks`
    $seenLinks = array();     // links already handled on this page
    preg_match_all('#<a.*?href="(.*?)".*?>.*?</a>#s', $content, $ergebnisse);
    foreach ($ergebnisse[1] as $link) {
        // Drop the fragment part.
        $linkParts = explode('#', $link);
        $link = $linkParts[0];
        if (!parse_url($link, PHP_URL_HOST)) {
            $link = $base_url . '/' . $link;
        }
        if ($link == '') {
            continue;
        }
        // Collapse duplicate slashes, then restore the scheme separator.
        // NOTE(review): relative links are resolved against the host root and
        // protocol-relative links (//host/path) lose their host here.
        $link = str_replace('//', '/', $link);
        $link = str_replace('http:/', 'http://', $link);
        $link = str_replace('https:/', 'https://', $link);

        // A link repeated on the same page needs no second round of queries.
        if (isset($seenLinks[$link])) {
            continue;
        }
        $seenLinks[$link] = true;

        $isBacklinkSource = crawlerRowExists('backlinks', 'from', $link);
        // Short-circuit: stop querying as soon as the link is known somewhere.
        // NOTE(review): the queue table is `temp`, yet `tocrawl` is checked
        // here - confirm that table exists and is the intended one.
        $isKnown = $isBacklinkSource
            || crawlerRowExists('index', 'url', $link)
            || crawlerRowExists('tocrawl', 'url', $link);

        if (!$isKnown) {
            $linkRows[] = '("' . mysql_real_escape_string($link) . '")';
        }
        if (!$isBacklinkSource) {
            $backlinkRows[] = '("' . mysql_real_escape_string($url) . '", "'
                . mysql_real_escape_string($link) . '")';
        }
    }
    // Write the collected rows; skip the statements entirely when there is
    // nothing to insert (an INSERT without value tuples is invalid SQL).
    if ($linkRows) {
        mysql_query('INSERT INTO `temp` (url) VALUES ' . implode(',', $linkRows));
    }
    if ($backlinkRows) {
        // This statement was assembled before but never sent to the database.
        mysql_query('INSERT INTO `backlinks` (`from`, `to`) VALUES ' . implode(',', $backlinkRows));
    }

    // ---- Index ------------------------------------------------------------
    if (crawlerRowExists('index', 'url', $url)) {
        // Page is already indexed.
        continue;
    }
    $metas = getMeta($content);
    if (!is_array($metas)) {
        $metas = array();
    }
    $cache = getTextCache($content);

    $title = isset($metas['title']) ? $metas['title'] : '';
    if (isset($metas['description'])) {
        $desc = $metas['description'];
    } else {
        // No meta description: use the first 25 words of the page text.
        $desc = implode(' ', array_slice(explode(' ', $cache), 0, 25)) . ' ...';
    }

    // Convert to UTF-8 first and escape afterwards, so the escaped form is
    // what actually reaches the statement.
    mysql_query(
        'INSERT INTO `index` (`url`, `title`, `desc`, `codirung`) VALUES ("'
        . mysql_real_escape_string($url) . '", "'
        . mysql_real_escape_string(mb_convert_encoding($title, 'utf-8')) . '", "'
        . mysql_real_escape_string(mb_convert_encoding($desc, 'utf-8')) . '", "'
        . mysql_real_escape_string((string) $code) . '")'
    );
    $id = mysql_insert_id();
    mysql_query(
        'INSERT INTO `cache` (`index_id`, `cache`) VALUES ("' . $id . '", "'
        . mysql_real_escape_string($content) . '")'
    );

    // Text that gets split into keywords: page text plus meta data and URL
    // (the URL is appended twice, which feeds into the occurrence count below).
    if (isset($metas['keywords'])) $cache .= ' ' . $metas['keywords'];
    $cache .= ' ' . $url;
    if (isset($metas['description'])) $cache .= ' ' . $metas['description'];
    if (isset($metas['title'])) $cache .= ' ' . $metas['title'];
    $cache .= ' ' . $url;
    // NOTE(review): the pattern has no /u modifier, so the umlauts in the
    // character class are matched byte-wise.
    $cache = preg_replace("/[^a-zA-Z0-9_äöüÄÖÜ ]/", "", $cache);
    // Collapse runs of spaces (the former loop tested a stale counter).
    $cache = preg_replace('/ {2,}/', ' ', $cache);

    // Extract the TLD (for *.wikipedia.* hosts the language sub-domain).
    $hostLabels = explode('.', str_replace('www.', '', $host));
    if (isset($hostLabels[1]) && $hostLabels[1] == 'wikipedia') {
        $coun = $hostLabels[0];
    } else {
        $coun = $hostLabels[count($hostLabels) - 1];
    }

    $metaDescription = isset($metas['description']) ? $metas['description'] : '';
    $metaKeywords = isset($metas['keywords']) ? $metas['keywords'] : '';
    $metaTitle = isset($metas['title']) ? $metas['title'] : '';

    $rank = getRank($url, $metas);
    $keywordRows = array();  // value tuples for INSERT INTO `keyword_link`
    $wordInfo = array();     // word => array(keyword id, word rank), per page
    foreach (explode(' ', $cache) as $word) {
        if (empty($word)) {
            continue;
        }
        if (!isset($wordInfo[$word])) {
            // First occurrence on this page: compute the rank and resolve the
            // keyword id once; repeated words reuse the cached values instead
            // of hitting the database again.
            $wordrank = $rank;
            if (strpos($metaDescription, $word) !== false) $wordrank += 20;
            if (strpos($metaKeywords, $word) !== false) $wordrank += 25;
            if (strpos($metaTitle, $word) !== false) $wordrank += 35;
            if (strpos($host, $word) !== false) $wordrank += 300;
            $wordrank += substr_count($cache, $word) * 2;

            // Check whether the keyword already exists, otherwise insert it.
            // Equality instead of LIKE: "_" is allowed in words and would act
            // as a wildcard.
            $wordSql = mysql_real_escape_string(mb_convert_encoding($word, 'utf-8'));
            $found = mysql_query('SELECT `id` FROM `keyword` WHERE `keyword` = \'' . $wordSql . '\' LIMIT 1');
            $row = $found ? mysql_fetch_array($found) : false;
            if ($row) {
                $keyid = $row['id'];
            } else {
                mysql_query('INSERT INTO `keyword` (`keyword`) VALUES ("' . $wordSql . '")');
                $keyid = mysql_insert_id();
            }
            $wordInfo[$word] = array($keyid, $wordrank);
        }
        list($keyid, $wordrank) = $wordInfo[$word];

        // Link keyword and page: one row boosted for the page's country/TLD,
        // one neutral row.
        $keywordRows[] = '("' . $keyid . '", "' . $id . '", "' . ($wordrank + 200) . '", "'
            . mysql_real_escape_string($coun) . '")';
        $keywordRows[] = '("' . $keyid . '", "' . $id . '", "' . $wordrank . '", "")';
    }
    if ($keywordRows) {
        mysql_query(
            'INSERT INTO `keyword_link` (`kid`, `iid`, `rank`, `country`) VALUES '
            . implode(',', $keywordRows)
        );
    }
}
//The Meta Function
/**
 * Extract <meta name="..." content="..."> pairs and the <title> from HTML.
 *
 * Only double-quoted attribute values are recognised. Meta tags lacking
 * either a name or a content attribute are ignored.
 *
 * @param string $quellencode Raw HTML source of the page.
 * @return array Map of meta name => content value, plus the key 'title' when
 *               a <title> element is present. Always an array, empty when
 *               nothing was found.
 */
function getMeta($quellencode){
    $meta = array();
    if (preg_match_all("#<meta([^>]*)>#si", $quellencode, $matches)) {
        foreach ($matches[1] as $attributes) {
            // Look the two attributes up independently so their order, their
            // letter case and any further attributes in the tag do not matter.
            // The look-behind keeps e.g. data-name="..." from matching.
            if (preg_match('/(?<![\w-])name\s*=\s*"([^"]*)"/i', $attributes, $name)
                && preg_match('/(?<![\w-])content\s*=\s*"([^"]*)"/i', $attributes, $value)) {
                $meta[$name[1]] = $value[1];
            }
        }
    }
    // The title is independent of meta tags, so it is looked up even when the
    // page has none. Non-greedy with /s so multi-line titles match and only
    // the first <title> element is taken.
    if (preg_match('#<title(?:\s[^>]*)?>(.*?)</title>#is', $quellencode, $title)) {
        $meta['title'] = $title[1];
    }
    return $meta;
}
//The Cache Function
/**
 * Reduce an HTML document to plain text for indexing.
 *
 * Restricts the input to the <body> element when one is found, removes
 * script and style blocks and inline style attributes, replaces every
 * remaining tag as well as CR/LF/TAB with a space and finally collapses runs
 * of spaces into a single space.
 *
 * @param string $quellencode Raw HTML source of the page.
 * @return string Page text with tags removed; empty string when the regex
 *                engine reports an error.
 */
function getTextCache($quellencode){
    if (preg_match("#<body.*?>(.*)<\/body>#ims", $quellencode, $matches)) {
        $quellencode = $matches[0];
    }
    // Patterns are applied one after another, in this order.
    $patterns = array(
        "#<script[\/\!]*?[^<>]*?>[\/\!]*?[^<>]*?</script>#si",
        "#<script.*?>.*?</script>#si",
        "#<script>.*?</script>#si",
        "#<style[\/\!]*?[^<>]*?>[\/\!]*?[^<>]*?</style>#si",
        "#<style.*?>.*?</style>#si",
        "#<style>.*?</style>#si",
        '#style=".*?"#si',
        "#<.*?>#si",
        '#\r|\n|\t#',
    );
    $replacements = array(" ", " ", "", " ", " ", " ", "", " ", ' ');
    $quellencode = preg_replace($patterns, $replacements, $quellencode);
    if ($quellencode === null) {
        // preg_replace() signals an engine error (e.g. backtrack limit) with null.
        return '';
    }
    // Collapse runs of spaces in one pass; the former replace loop swapped a
    // single space for a single space and therefore never collapsed anything.
    return preg_replace('/ {2,}/', ' ', $quellencode);
}
//The robots.txt Function
/**
 * Check whether the robots.txt of the URL's host disallows crawling the URL.
 *
 * Only "User-agent" and "Disallow" lines are evaluated. A rule applies when
 * its group names "*" or this crawler, and it matches when the URL path
 * starts with the rule's path. "Allow" lines and wildcard patterns inside
 * rules are not interpreted.
 *
 * @param string $url Absolute URL that is about to be crawled.
 * @return bool True when the URL is disallowed; false when it is allowed,
 *              when the URL has no scheme/host, or when robots.txt cannot be
 *              opened.
 */
function robotstxt($url){
    $parts = parse_url($url);
    if (!is_array($parts) || !isset($parts['scheme']) || !isset($parts['host'])) {
        return false;
    }
    // An absent path means the site root.
    $hinter = (isset($parts['path']) && $parts['path'] !== '') ? $parts['path'] : '/';
    $basicurl = $parts['scheme'] . '://' . $parts['host'];
    if (isset($parts['port'])) {
        $basicurl .= ':' . $parts['port'];
    }

    $robots = @fopen($basicurl . '/robots.txt', "r");
    if (!$robots) {
        // No robots.txt reachable: nothing is disallowed.
        return false;
    }

    $applies = false;       // does the current group address this crawler?
    $lastWasAgent = false;  // consecutive User-agent lines form one group
    $disallowed = false;
    while (($zeile = fgets($robots)) !== false) {
        // Strip comments and surrounding whitespace.
        $hash = strpos($zeile, '#');
        if ($hash !== false) {
            $zeile = substr($zeile, 0, $hash);
        }
        $zeile = trim($zeile);
        if ($zeile === '') {
            continue;
        }
        if (stripos($zeile, 'User-agent:') === 0) {
            $agent = trim(substr($zeile, strlen('User-agent:')));
            $matches = strpos($agent, '*') !== false || strpos($agent, '****') !== false;
            // A new group starts unless the previous line was a User-agent too.
            $applies = $lastWasAgent ? ($applies || $matches) : $matches;
            $lastWasAgent = true;
            continue;
        }
        $lastWasAgent = false;
        if ($applies && stripos($zeile, 'Disallow:') === 0) {
            $rule = trim(substr($zeile, strlen('Disallow:')));
            // An empty Disallow allows everything; otherwise the rule is a
            // path prefix.
            if ($rule !== '' && strpos($hinter, $rule) === 0) {
                $disallowed = true;
                break;
            }
        }
    }
    // Close the handle on every path, including the "disallowed" one.
    fclose($robots);
    return $disallowed;
}
/**
 * Compute the base rank of a page from its meta data and URL depth.
 *
 * A meta description adds 25 and meta keywords add 15. A URL without a path
 * or with the path "/" adds 100; any other path subtracts the number of
 * "/"-separated segments of the path.
 *
 * @param string $url  URL of the page.
 * @param array  $meta Meta data map; only the presence of the keys
 *                     'description' and 'keywords' is checked.
 * @return int Base rank of the page.
 */
function getRank($url, $meta){
    $score = (isset($meta['description']) ? 25 : 0)
           + (isset($meta['keywords']) ? 15 : 0);

    $path = parse_url($url, PHP_URL_PATH);

    // No path component at all (or an unparsable URL) counts as the start page.
    if ($path === null || $path === false) {
        return $score + 100;
    }
    if ($path == '/') {
        return $score + 100;
    }

    // Deeper pages lose one point per path segment.
    $segments = substr_count($path, '/') + 1;
    return $score - $segments;
}
?>

Kommentar