Here's the juice: the actual spiderlet.
This script is executed by the crawler. If you read the crawler script you will see that I currently let 10 run at once. This is an arbitrary number.
There is a little chunk at the beginning of the code that deals with "customWords" - this is a small file I use to "augment" certain words. For example, I have sites that deal with things that might be red, might be a bra, might be new etc - and unfortunately, MySQL by default does not FULLTEXT index words that are only 3 chars long. You can modify this and recompile, but it puts a burden on the search engine and suddenly things don't work quite so nicely. So instead, I have a little file that looks like this:
<?php
// Sample custom-words file. The spiderlet includes this file only when it exists,
// after it has already created $customSearchWords and $customReplaceWords.
// Append one entry to each list per word that needs special tagging; the two lists
// are used together as the search/replace arguments of str_replace(), so entry N of
// the search list is rewritten to entry N of the replace list.
// Words that often need tagging are 3 letter words or incredibly common words.
$customSearchWords[] = 'aid';
$customReplaceWords[] = 'aidxx';
?>
... where 3 letter words are replaced with padded versions that are no longer normal words, and I can make the same substitution when a surfer types them into a search box. For example, someone types in "aid" and in the background I start searching for "aidxx."
Another thing you'll notice about the code is that I exclude any file that doesn't look like it's going to return HTML to me... images, zips - you name it. Makes the handling of weirdness later easier.
<edit> Sorry - looks like the forum flubbed up the tabbing a bit... you'll need to clean yours manually</edit>
#! /usr/local/bin/php
<?php
// Spiderlet entry point: crawls exactly one queued page and then ends.
// Arguments: <linkID> <spiderletID> [sessionID] [verbose]
//error_reporting(E_ALL);
require_once('./paths.inc');
require_once("$classPath/class.webrequest.php");
require_once("$classPath/class.dbconnection.php");
require_once("$rootPath/localvars.php");

// Optional word substitutions (see custom.searchWords.php); only loaded when present.
$customWords = "$systemPath/custom.searchWords.php";
if (file_exists($customWords))
{
    // These must start out as arrays: the included file appends to them with [],
    // and appending to a string with [] is a fatal error on PHP 7.1 and later.
    $customSearchWords = array();
    $customReplaceWords = array();
    include($customWords);
    $GLOBALS['customsearch']['search'] = &$customSearchWords;
    $GLOBALS['customsearch']['replace'] = &$customReplaceWords;
}

// The Spiderlet constructor picks the connection up from $GLOBALS['utilDB'].
$db = new dbConnection($db_host, $db_user, $db_password, $db_database);
$GLOBALS['utilDB'] = &$db;

// Both ids are interpolated unquoted into SQL by the Spiderlet, so force them to
// integers; a missing id becomes 0, which keeps every statement well-formed.
$cliArgs = $_SERVER['argv'];
$linkID = isset($cliArgs[1]) ? (int) $cliArgs[1] : 0;
$spiderletID = isset($cliArgs[2]) ? (int) $cliArgs[2] : 0;
// The last two arguments are optional; default them instead of reading undefined offsets.
$sessionID = isset($cliArgs[3]) ? $cliArgs[3] : '';
$verbose = isset($cliArgs[4]) ? $cliArgs[4] : '';
$quiet = !$verbose;

$spiderlet = new Spiderlet($linkID, $spiderletID, $quiet, $sessionID);
$spiderlet->Crawl();
class Spiderlet{
// Shared dbConnection handle; the constructor takes it from $GLOBALS['utilDB'].
var $db;
// WebRequest instance used to fetch the page.
var $http;
// Page title; set by ExtractTitle().
var $title;
// SQL-escaped avatar path; set by ExtractPageAvatar() when the page declares one.
var $pageAvatar;
// Text written to crawl_pages.searchblob by Crawl(); it is filled outside the
// methods that read it (presumably by FinalCleaning() - confirm).
var $content;
// id of the crawl_pages row this spiderlet is working on.
var $linkID;
// id of the crawl_spiderlets row deleted when this object is destroyed.
var $releaseID;
// Regex fragments for links that must not be queued; filled by the constructor.
var $badLinks = array();
// Characters stripped from the page text; filled by the constructor.
var $badChars = array();
// URLs that Crawl() inserts into crawl_pages (presumably gathered by GatherLinks() - confirm).
var $links = array();
// Quiet flag passed to the constructor.
var $quiet;
// Optional session id; Crawl() sends it to the page as a 'sessionid' cookie.
var $sessionID;
// Page content as returned by the WebRequest; assigned in Crawl(). Declared here
// because it was previously created as a dynamic property.
var $buffer;
// All $badLinks fragments joined with '|'; built by the constructor and used by
// AcceptableLink(). Declared here because it was previously a dynamic property.
var $badLinkStr;
// Set up a spiderlet for one queued page.
//   $myLinkID    - id of the crawl_pages row to fetch
//   $myReleaseID - id of the crawl_spiderlets row removed when this object is destroyed
//   $qt          - quiet flag, stored as given
//   $sID         - optional session id; Crawl() sends it as a 'sessionid' cookie
function __construct($myLinkID, $myReleaseID, $qt=true, $sID='')
{
    // Job identity and run options.
    $this->linkID = $myLinkID;
    $this->releaseID = $myReleaseID;
    $this->sessionID = $sID;
    $this->quiet = $qt;

    // Collaborators: the shared database handle and a fresh HTTP requestor.
    $this->db = &$GLOBALS['utilDB'];
    $requestor = new WebRequest();
    $requestor->Port = 80;
    $requestor->endOnBody = true;
    $requestor->timeout = 10;
    $requestor->succeedOnTimeout = true;
    $this->http = $requestor;

    // Regex fragments describing links that should never be queued:
    // non-HTTP schemes first, then file types that will not return HTML.
    $rejectPatterns = array(
        'file\:', 'news\:', 'ftp\:', 'mailto\:', 'telnet\:', 'javascript\:', 'https\:',
        '\.gif', '\.jpg', '\.png', '\.pdf', '\.tar', '\.zip', '\.rpm',
        '\.mp3', '\.aac', '\.wmf', '\.mov', '\.com', '\.deb', '\.tgz',
        '\.gz', '\.rtf', '\.doc', '\.aiff', '\.wav', '\.tif',
    );
    foreach ($rejectPatterns as $fragment)
    {
        $this->badLinks[] = $fragment;
    }
    // One alternation, so AcceptableLink() needs only a single preg_match().
    $this->badLinkStr = implode('|', $this->badLinks);

    // Carriage return and tab are stripped from the page text later on.
    array_push($this->badChars, chr(13), chr(9));
}
// Release this spiderlet's slot: remove its row from crawl_spiderlets when the
// object is destroyed.
function __destruct()
{
    $releaseSql = "delete from crawl_spiderlets where id={$this->releaseID}";
    $this->db->query($releaseSql);
}
// Decide whether a link may be queued for crawling.
//   $linkStr - the link text to test
// Returns true when none of the reject fragments built by the constructor occurs
// anywhere in $linkStr (the fragments are not anchored), false otherwise.
function AcceptableLink($linkStr)
{
    // Match case-insensitively so that variants such as ".JPG" or "HTTPS:" are
    // rejected just like their lower-case forms.
    return !preg_match("/{$this->badLinkStr}/i", $linkStr);
}
// Fetch this spiderlet's page, store its title/avatar/content in crawl_pages and
// queue every link found on it. On a failed fetch or an error page the row is
// marked with crawlstate=-1 and the script exits.
function Crawl()
{
    // Get the job from the database...
    $this->db->query("select crawl_pages.*, domain from crawl_pages, crawl_domains where crawl_pages.id={$this->linkID} and crawl_domains.id=crawl_pages.siteid");
    $this->db->fetchArray();
    $siteID = $this->db->row['siteid'];

    // Setup the requestor...
    $this->http->Host = strtolower($this->db->row['domain']);
    $this->http->URL = $this->db->row['url'];

    // It's possible that the crawler main put a sessionID into me that I need to pass
    // to the page I call as a cookie...
    if ($this->sessionID) { $this->http->SetCookie('sessionid', $this->sessionID); }

    // Get the page. The failure update has to go through $this->db: there is no
    // local $db inside this method.
    if (!$this->http->Get())
    {
        $this->db->query("update crawl_pages set crawlstate=-1 where id={$this->linkID}");
        exit;
    }
    $this->buffer = $this->http->Content();
    $DNI = preg_match('/rfspider\: donotindex/', $this->buffer);

    // If it is an error, get out quick.
    // NOTE(review): this treats any content containing the text '404' as an error
    // page; whether Content() includes the status line is not visible here.
    if (strpos($this->buffer, '404') !== false)
    {
        // The raw page goes into a quoted SQL literal, so it must be escaped.
        // mysql_escape_string() is what the rest of this file uses; it no longer
        // exists in PHP 7+, so switch to the connection's own escaping when migrating.
        $errorBlob = mysql_escape_string($this->buffer);
        $this->db->query("update crawl_pages set crawlstate=-1, pagetitle='', searchblob='$errorBlob' where id={$this->linkID}");
        exit;
    }

    // Distill the page into my content...
    $this->ExtractTitle();
    $this->ExtractPageAvatar();
    $this->GatherLinks();
    $this->FinalCleaning();

    // Update the database now...
    // NOTE(review): $this->content is interpolated as-is; presumably FinalCleaning()
    // escapes it - confirm.
    $now = date('Y-m-d H:i:s');
    $title = mysql_escape_string($this->title);
    $this->db->query("update crawl_pages set crawlstate=0, crawlfound=1, lastcrawl='$now', " .
        "pagetitle='$title', avatar='{$this->pageAvatar}', searchblob='{$this->content}' where id={$this->linkID}");

    // Now insert links. The table is UNIQUE indexed, so it will fail if the page already exists...
    // foreach visits every stored link even if the array keys are not a dense 0..n-1 run.
    // NOTE(review): links are interpolated as-is; presumably GatherLinks() escapes them - confirm.
    foreach ($this->links as $newPage)
    {
        // Skip links back to this very page and empty/blank-leading entries.
        // Note that "to work" values of crawlstate and crawlfound are set by the DB...
        if (($newPage == $this->http->URL) || ($newPage <= ' ')) { continue; }
        $this->db->query("insert into crawl_pages(siteid, url, referrer) values($siteID, '$newPage', {$this->linkID})", true);
    }

    // Now this looks rather bizarre, but if <this page> didn't want to
    // be in the index, eliminate it. It would, however, have contributed
    // to the links to-do list by now...
    if ($DNI) { $this->db->query("delete from crawl_pages where id={$this->linkID}"); }
}
// Pull the optional page avatar out of the fetched page.
// The pageavatar is a graphic that can be used for search results.
// If it's in the page, it'll be like this: <!-- pageavatar: /graphics/afile.jpg -->
// The captured value runs up to the next space and is stored SQL-escaped in
// $this->pageAvatar; the property is left untouched when the page has no marker.
function ExtractPageAvatar()
{
    $matches = array();
    preg_match('/pageavatar:[ ]*([^ ]*)/i', $this->buffer, $matches);
    // $matches[1] only exists when the marker was found; test it without reading
    // an undefined offset.
    if (!empty($matches[1]))
    {
        $this->pageAvatar = mysql_escape_string($matches[1]);
    }
}
// Set $this->title from the page's <title> element, falling back to
// '[ No Page Title ]', then apply the custom word substitutions (if any were
// loaded into $GLOBALS['customsearch']) to it.
function ExtractTitle()
{
    $this->title = '[ No Page Title ]';

    $matches = array();
    preg_match('/\<title\>([^<]*)/i', $this->buffer, $matches);
    // $matches[1] only exists when a <title> was found.
    if (!empty($matches[1])) { $this->title = $matches[1]; }

    // Read the substitution lists by value. Passing &$GLOBALS[...] inside the
    // argument list (call-time pass-by-reference) is a fatal error since PHP 5.4,
    // and str_replace() does not need references anyway.
    $searchWords = isset($GLOBALS['customsearch']['search']) ? $GLOBALS['customsearch']['search'] : array();
    $replaceWords = isset($GLOBALS['customsearch']['replace']) ? $GLOBALS['customsearch']['replace'] : '';
    if ($searchWords)
    {
        $this->title = str_replace($searchWords, $replaceWords, $this->title);
    }
}
function FinalCleaning()
{
$regex = (strpos($this->buffer, '<!-- startcontent')) ? '/\<\!\-\- startcontent \-\-\>(.*)\<\!\-\- endcontent/ismU' : '/\<body(.*)$/ismU';
preg_match($regex, $this->buffer, $matches);
$cleanArr = array();
$cleanArr[] = '/(\<script.*\<\/script\>)/imsU';
$cleanArr[] = '/(\<style.*\<\/style\>)/imsU';
$cleanArr[] = '/(\<\!\-\- hide.*endhide \-\-\>)/imsU';
$replArr = array(' ', ' ', ' ');
$this->buffer = preg_replace($cleanArr, $replArr, $this->buffer);
$this->buffer = strip_tags(str_replace($this->badChars, '', $this->buffer));
$searchWords = &$GLOBALS