|
webinfoguy25
|
 |
« Reply #15 on: October 21, 2009, 11:10:56 AM » |
|
OK, probably not the simplest, but certainly not a complicated thing. I have several WH sites that need a sitemap and searchability, but I am no longer in control of the content - the clients, or PinkHat, or any number of sources might contribute/change the content of a site. So I have a cron job that runs nightly and executes a little single-site crawler, then places searchable content into a database as well as creating the content for a static sitemap. It's really, really simple, but might offer the fledgling spider author some ideas. Here is the most basic way of calling it. The print_r at the end is just a really simple way of seeing everything that the object contains when it's done. Note that I don't do any real checking of the site, nor is it capable of handling relative URLs (I don't use any for my sites). $crawler = new simpleCrawler(); $crawler->domain = 'www.mydomain.com'; $crawler->crawl(); print_r($crawler);
Here is the code: <?php
/**
 * Minimal single-site crawler.
 *
 * Starting from "http://{domain}/", fetches every internal page it can reach
 * over plain HTTP and records, per page, its title, the links it contains
 * (split into internal and external), its visible text and a reduced
 * "searchable" word list. After crawl() finishes:
 *
 *   - $pages is keyed by page URL; each entry holds the keys
 *     'url', 'title', 'internal', 'external', 'searchable' and 'content'.
 *   - $site['internal'] / $site['external'] are sorted, de-duplicated lists
 *     of every internal / external link seen on any page.
 *
 * Limitations: document-relative links ("page.html") are not resolved, and
 * https links, links containing a query string, mailto links and PDF links
 * are ignored entirely.
 */
class simpleCrawler {
    /** @var string[] Queue of URLs waiting to be fetched. */
    private $todo = array();
    /** @var array URLs already fetched, stored as keys for O(1) lookup. */
    private $done = array();
    /** @var string Raw HTML of the page currently being processed. */
    private $pageBuff = '';
    /** @var string URL of the page currently being processed. */
    private $currentURL = '';

    /** @var string Host name to crawl, e.g. 'www.mydomain.com' (no scheme, no path). */
    public $domain;
    /** @var array Site-wide link lists, filled in by crawl(). */
    public $site = array();
    /** @var array Per-page results, filled in by crawl(). */
    public $pages = array();
    /** @var bool When truthy, progress messages are echoed. */
    public $debug = false;
    /** @var int In debug mode, stop after this many fetch attempts (0 = no limit). */
    public $debugMax = 0;

    /**
     * Builds the site-wide, sorted, de-duplicated link lists from the
     * per-page link sets. Always leaves both lists defined, even when no
     * page or link was collected.
     */
    private function compileSite()
    {
        $internal = array();
        $external = array();

        foreach ($this->pages as $page) {
            // Links are stored as array keys, so array union de-duplicates them.
            if (!empty($page['internal'])) {
                $internal += $page['internal'];
            }
            if (!empty($page['external'])) {
                $external += $page['external'];
            }
        }

        // PHP casts numeric-looking string keys to int; cast back so both
        // lists contain only strings.
        $this->site = array(
            'internal' => array_map('strval', array_keys($internal)),
            'external' => array_map('strval', array_keys($external)),
        );

        sort($this->site['internal'], SORT_STRING);
        sort($this->site['external'], SORT_STRING);
    }

    /**
     * Echoes a progress message when debugging is enabled.
     *
     * @param string $msg
     */
    private function debug($msg)
    {
        if ($this->debug) {
            echo "$msg\n";
        }
    }

    /**
     * Extracts the visible text of the current page ('content') and a
     * reduced word list suitable for indexing ('searchable').
     */
    private function extractContent()
    {
        // Create the key first so 'searchable' is listed before 'content'
        // when the result is dumped with print_r().
        $this->pages[$this->currentURL]['searchable'] = false;

        // Only text from <body> onwards is of interest; fall back to the
        // whole document when there is no <body> tag.
        $bodyStart = stripos($this->pageBuff, '<body');
        $html = ($bodyStart === false)
            ? $this->pageBuff
            : substr($this->pageBuff, $bodyStart);

        // Treat non-breaking spaces (entity or UTF-8 character) as ordinary
        // spaces, then collapse every run of whitespace into a single space.
        $text = strip_tags($html);
        $text = str_ireplace(array('&nbsp;', "\xC2\xA0"), ' ', $text);
        $text = trim((string) preg_replace('/\s+/', ' ', $text));
        $this->pages[$this->currentURL]['content'] = $text;

        // Searchable form: punctuation that commonly joins words becomes a
        // separator, everything else that is not a letter, digit or space
        // is removed.
        $flat = str_replace(array('-', '_', '.', ':', '/', '\\', ','), ' ', $text);
        $flat = (string) preg_replace('/[^A-Z0-9 ]/i', '', $flat);

        $keep = array();
        foreach (preg_split('/ +/', $flat, -1, PREG_SPLIT_NO_EMPTY) as $word) {
            // Drop short words and purely numeric tokens.
            if (strlen($word) < 4 || preg_match('/^[0-9]+$/', $word)) {
                continue;
            }
            $keep[] = $word;
        }

        $this->pages[$this->currentURL]['searchable'] = implode(' ', $keep);
    }

    /**
     * Stores the current page's <title> text, or an empty string when the
     * page has no title element.
     */
    private function extractTitle()
    {
        $title = '';
        // Non-greedy + dot-all: tolerate attributes on the tag, titles that
        // span several lines and more than one </title> in the document.
        if (preg_match('/<title[^>]*>(.*?)<\/title>/is', $this->pageBuff, $parts)) {
            $title = trim((string) preg_replace('/\s+/', ' ', $parts[1]));
        }
        $this->pages[$this->currentURL]['title'] = $title;
    }

    /**
     * Collects the links of the current page, classifies each as internal
     * or external, and queues internal pages that have not been seen yet.
     */
    private function extractURLs()
    {
        $this->pages[$this->currentURL]['internal'] = array();
        $this->pages[$this->currentURL]['external'] = array();

        // href may be single- or double-quoted and need not be the first
        // attribute of the anchor.
        preg_match_all(
            '/<a\s(?:[^>]*?\s)?href\s*=\s*(["\'])(.*?)\1/is',
            $this->pageBuff,
            $found
        );

        // A URL is internal only when the host is exactly the configured
        // domain: quote the domain for the regex, anchor at the start and
        // require a delimiter (or the end of the URL) after the host.
        $internalPattern = '~^http://' . preg_quote($this->domain, '~') . '(?:[/:?#]|$)~i';

        foreach ($found[2] as $url) {
            $url = trim($url);
            if ($url === '') {
                continue;
            }

            if (strpos($url, '//') === 0) {
                // Protocol-relative link: it names its own host.
                $url = "http:$url";
            } elseif ($url[0] === '/') {
                // Root-relative link: belongs to the crawled domain.
                $url = "http://{$this->domain}$url";
            }

            // Links that are deliberately ignored altogether.
            if (
                preg_match('~^https://~i', $url) ||
                strpos($url, '?') !== false ||
                preg_match('/^mailto/i', $url) ||
                preg_match('/\.pdf$/i', $url)
            ) {
                continue;
            }

            if (!preg_match($internalPattern, $url)) {
                $this->pages[$this->currentURL]['external'][$url] = true;
                $this->debug("DENY $url");
                continue;
            }

            $this->pages[$this->currentURL]['internal'][$url] = true;

            if (isset($this->done[$url])) {
                $this->debug("DONE $url");
            } elseif (!in_array($url, $this->todo, true)) {
                $this->todo[] = $url;
                $this->debug("TODO $url");
            }
        }
    }

    /**
     * Crawls the configured domain and fills $pages and $site.
     *
     * Pages that cannot be fetched are skipped (they still appear in the
     * link lists of the pages that reference them). When both $debug and
     * $debugMax are set, the crawl stops after $debugMax fetch attempts.
     *
     * @throws Exception when $domain has not been set.
     */
    public function crawl()
    {
        if (!$this->domain) {
            throw new Exception('simpleCrawler: You cannot crawl without specifying a domain.');
        }

        $this->pages = array();
        $this->site = array('internal' => array(), 'external' => array());
        $this->done = array();
        $this->todo = array("http://{$this->domain}/");

        $attempts = 0;
        while (count($this->todo) > 0) {
            $thisURL = $this->currentURL = array_shift($this->todo);
            $this->debug("\n\nCRAWL $thisURL");

            // Mark before fetching so a failing page is never retried.
            $this->done[$thisURL] = true;

            $html = file_get_contents($thisURL);
            if ($html === false) {
                $this->debug("FAIL $thisURL");
            } else {
                $this->pageBuff = $html;
                $this->pages[$thisURL]['url'] = $thisURL;
                $this->extractTitle();
                $this->extractURLs();
                $this->extractContent();
            }

            $attempts++;
            if ($this->debug && $this->debugMax && $attempts >= $this->debugMax) {
                break;
            }
        }

        $this->compileSite();
    }
}
Enjoy! <add> Added so that VSloathe would be more impressed with the output:  After the crawl is done, you have $crawler->pages, which is an array of arrays containing the internal & external links on each page, as well as each page's title, display content and searchable content for the database. You also have $crawler->site['internal'] and $crawler->site['external'], which contain all the internal links and external links on the entire site, most helpful for building a sitemap. </add> How come you don't use relative URLs?
|