我有一堆网站以字符串形式存储在3个mysql表中。我的脚本将它们放入数组,进行解析,提取所有链接并将它们分类为2个表。它分为3个相同的模块进行分类。
整个过程循环执行,每30秒执行一次操作。
由于某种原因,它仅是第一次按预期工作,后来没有任何反应。
在获取代码之前,我对折旧的mysql表示歉意,该脚本仅在本地计算机上使用,我将在适当的时候对其进行更新。
这是我的代码:
$i=1;
$domain1 = 'example1.com';
$domain2 = 'example2.com';
$domain3 = 'example3.com';
$robots1 = array("url1",
"url2",
"url3");
$robots2 = array("url1",
"url2",
"url3");
$robots3 = array("url1",
"url2",
"url3");
require_once 'Normalizer.php';
$conn = mysql_connect('localhost:3306','user', 'pass', true );
mysql_select_db( 't1000', $conn );
while ($i<=50000) {
$query = 'SELECT * FROM dump1';
$result1=mysql_query( $query, $conn );
$strings1=array();
while ($row = mysql_fetch_assoc($result1)) {
array_push($strings1, $row["link"]);
}
$query = 'TRUNCATE TABLE dump1';
$delete=mysql_query( $query, $conn );
$query = 'SELECT * FROM dump2';
$result1=mysql_query( $query, $conn );
$strings2=array();
while ($row = mysql_fetch_assoc($result1)) {
array_push($strings2, $row["link"]);
}
$query = 'TRUNCATE TABLE dump2';
$delete=mysql_query( $query, $conn );
$query = 'SELECT * FROM dump3';
$result1=mysql_query( $query, $conn );
$strings3=array();
while ($row = mysql_fetch_assoc($result1)) {
array_push($strings3, $row["link"]);
}
$query = 'TRUNCATE TABLE dump3';
$delete=mysql_query( $query, $conn );
// Module 1 start
$ii=0;
$links = array();
$edofollow = array();
$enofollow = array();
$internal = array();
foreach ($strings1 as $value)
{
$input=$strings1[$ii];
$htm=stripcslashes($input);
$doc = new DOMDocument();
@$doc->loadHTML($htm);
$arr = $doc->getElementsByTagName("a"); // DOMNodeList Object
foreach($arr as $item) { // DOMElement Object
$href = $item->getAttribute("href");
$rel = $item->getAttribute("rel");
$text = trim(preg_replace("/[\r\n]+/", " ", $item->nodeValue));
$links[] = array(
'href' => $href,
'rel' => $rel,
'text' => $text
);
if (strpos($href, '://')!==false AND strpos($href, $domain1)==false AND $rel!=='nofollow')
{
$un = new URL\Normalizer();
$un->setUrl( $href );
$href= parse_url($un->normalize(), PHP_URL_HOST);
array_push($edofollow, $href);
}
else if (strpos($href, '://')!==false AND strpos($href, $domain1)==false AND $rel=='nofollow')
{
$un1 = new URL\Normalizer();
$un1->setUrl( $href );
array_push($enofollow, $un1->normalize());
}
else if (strpos($href,'://')==false or strpos($href,$domain1)!==false)
{
$un2 = new URL\Normalizer();
$un2->setUrl( $href );
$href1=$un2->normalize();
if (strpos($href1, 'TRANSCRIPTS')==false AND strpos($href1, '(')==false AND strpos($href1, ')')==false AND strpos($href1, '#')==false AND strpos($href1, 'javascript')==false AND strpos($href1, '?')==false AND strpos($href1, 'void')==false)
{
if($href1=='' or $href1=='/')
{}
else{
if (strpos($href1, '://')==false)
{$href1='http://'.$domain1.$href1;}
if (in_array($href1, $robots1)) { }
else {
array_push($internal, $href1);
}
}
}
}
}
$uedofollow = array_values(array_unique($edofollow));
foreach ($uedofollow as $value) {
$query=mysql_query("select * from dofollow where link='".$value."' ");
$duplicate=0;
if($query){
$duplicate=mysql_num_rows($query);
}
if($duplicate==0)
{
$sql='INSERT INTO dofollow (link) VALUES ("'.$value.'")';
mysql_query( $sql, $conn );
}
}
$uinternal = array_values(array_unique($internal));
foreach ($uinternal as $value2) {
$query=mysql_query("select * from joblist1 where link='".$value2."' ");
if ($query) {
$duplicate=0;
$duplicate=mysql_num_rows($query);
if($duplicate==0)
{
$sql='INSERT INTO joblist1 (link) VALUES ("'.$value2.'")';
mysql_query( $sql, $conn );
}
}
}
$ii=$ii+1;
}
// Module 1 ends
// Module 2 start
$links = array();
$edofollow = array();
$enofollow = array();
$internal = array();
$ii=0;
foreach ($strings2 as $value)
{
$input=$strings2[$ii];
$htm=stripcslashes($input);
$doc = new DOMDocument();
@$doc->loadHTML($htm);
$arr = $doc->getElementsByTagName("a"); // DOMNodeList Object
foreach($arr as $item) { // DOMElement Object
$href = $item->getAttribute("href");
$rel = $item->getAttribute("rel");
$text = trim(preg_replace("/[\r\n]+/", " ", $item->nodeValue));
$links[] = array(
'href' => $href,
'rel' => $rel,
'text' => $text
);
if (strpos($href, '://')!==false AND strpos($href, $domain2)==false AND $rel!=='nofollow')
{
$un = new URL\Normalizer();
$un->setUrl( $href );
$href= parse_url($un->normalize(), PHP_URL_HOST);
array_push($edofollow, $href);
}
else if (strpos($href, '://')!==false AND strpos($href, $domain2)==false AND $rel=='nofollow')
{
$un1 = new URL\Normalizer();
$un1->setUrl( $href );
array_push($enofollow, $un1->normalize());
}
else if (strpos($href,'://')==false or strpos($href,$domain2)!==false)
{
$un2 = new URL\Normalizer();
$un2->setUrl( $href );
$href1=$un2->normalize();
if (strpos($href1, 'TRANSCRIPTS')==false AND strpos($href1, '(')==false AND strpos($href1, ')')==false AND strpos($href1, '#')==false AND strpos($href1, 'javascript')==false AND strpos($href1, '?')==false AND strpos($href1, 'void')==false)
{
if($href1=='' or $href1=='/')
{}
else{
if (strpos($href1, '://')==false)
{$href1='http://'.$domain2.$href1;}
if (in_array($href1, $robots2)) { }
else {
array_push($internal, $href1);
}
}
}
}
}
$uedofollow = array_values(array_unique($edofollow));
foreach ($uedofollow as $value) {
$query=mysql_query("select * from dofollow where link='".$value."' ");
$duplicate=0;
if($query){
$duplicate=mysql_num_rows($query);
}
if($duplicate==0)
{
$sql='INSERT INTO dofollow (link) VALUES ("'.$value.'")';
mysql_query( $sql, $conn );
}
}
$uinternal = array_values(array_unique($internal));
foreach ($uinternal as $value2) {
$query=mysql_query("select * from joblist2 where link='".$value2."' ");
if ($query) {
$duplicate=0;
$duplicate=mysql_num_rows($query);
if($duplicate==0)
{
$sql='INSERT INTO joblist2 (link) VALUES ("'.$value2.'")';
mysql_query( $sql, $conn );
}
}
}
$ii=$ii+1;
}
// Module 2 Ends
// Module 3 start
$links = array();
$edofollow = array();
$enofollow = array();
$internal = array();
$ii=0;
foreach ($strings3 as $value)
{
$input=$strings3[$ii];
$htm=stripcslashes($input);
$doc = new DOMDocument();
@$doc->loadHTML($htm);
$arr = $doc->getElementsByTagName("a"); // DOMNodeList Object
foreach($arr as $item) { // DOMElement Object
$href = $item->getAttribute("href");
$rel = $item->getAttribute("rel");
$text = trim(preg_replace("/[\r\n]+/", " ", $item->nodeValue));
$links[] = array(
'href' => $href,
'rel' => $rel,
'text' => $text
);
if (strpos($href, '://')!==false AND strpos($href, $domain3)==false AND $rel!=='nofollow')
{
$un = new URL\Normalizer();
$un->setUrl( $href );
$href= parse_url($un->normalize(), PHP_URL_HOST);
array_push($edofollow, $href);
}
else if (strpos($href, '://')!==false AND strpos($href, $domain3)==false AND $rel=='nofollow')
{
$un1 = new URL\Normalizer();
$un1->setUrl( $href );
array_push($enofollow, $un1->normalize());
}
else if (strpos($href,'://')==false or strpos($href,$domain3)!==false)
{
$un2 = new URL\Normalizer();
$un2->setUrl( $href );
$href1=$un2->normalize();
if (strpos($href1, 'TRANSCRIPTS')==false AND strpos($href1, '(')==false AND strpos($href1, ')')==false AND strpos($href1, '#')==false AND strpos($href1, 'javascript')==false AND strpos($href1, '?')==false AND strpos($href1, 'void')==false)
{
if($href1=='' or $href1=='/')
{}
else{
if (strpos($href1, '://')==false)
{$href1='http://'.$domain3.$href1;}
if (in_array($href1, $robots3)) { }
else {
array_push($internal, $href1);
}
}
}
}
}
$uedofollow = array_values(array_unique($edofollow));
foreach ($uedofollow as $value) {
$query=mysql_query("select * from dofollow where link='".$value."' ");
$duplicate=0;
if($query){
$duplicate=mysql_num_rows($query);
}
if($duplicate==0)
{
$sql='INSERT INTO dofollow (link) VALUES ("'.$value.'")';
mysql_query( $sql, $conn );
}
}
$uinternal = array_values(array_unique($internal));
foreach ($uinternal as $value2) {
$query=mysql_query("select * from joblist3 where link='".$value2."' ");
if ($query) {
$duplicate=0;
$duplicate=mysql_num_rows($query);
if($duplicate==0)
{
$sql='INSERT INTO joblist3 (link) VALUES ("'.$value2.'")';
mysql_query( $sql, $conn );
}
}
}
$ii=$ii+1;
}
// Module 3 ends
sleep(30);
$i=$i++;
}
我几天来一直在尝试对它进行故障排除,周围情况杂乱无章,但是没有运气...
最佳答案
尝试将其包装在do {} while();中;
即:
$i = 1;
do {
echo "some crap $i<br>\n";
} while($i<=50000);
关于php - 虽然PHP运算符与MySQL Truncate不起作用,我们在Stack Overflow上找到一个类似的问题:https://stackoverflow.com/questions/25936672/