Launch Class:
<?php
/* -------------------------------------------------------------------------
*
* Google SERP Scraper
*
* - Purpose: Launches requests and collects responses
*
* - Usage: Create new object with google domain to scrape and
* any changes to valid search string parameters.
*
* Pass query and any additional options to 'search' method.
*
* - Returns: Google_Serp_ResultSet Object
*
* - Author: gatecrasher1981@gmail.com
*
* Any feedback welcome.
*
* --------------------------------------------------------------------------*/
/**
 * Launches Google search requests page by page and collects the parsed
 * results into a DOM container.
 *
 * Options are passed around using "friendly" names (e.g. 'results'); the
 * $validOptions map translates them to the real query-string parameters
 * (e.g. 'num') when the search URL is built.
 */
class Google_Serp_Scraper
{
    /** Page size assumed when no 'results' option is supplied. */
    const DEFAULT_RESULTS_PER_PAGE = 10;

    // -----------------
    // Properties
    // -----------------

    /** @var string Domain the search requests are sent to. */
    public $baseDomain;

    /** @var array Map of query-string parameter => friendly option name. */
    public $validOptions;

    /** @var int Number of <Result> elements currently held in $results. */
    protected $resultsTally = 0;

    /** @var DOMDocument Container holding EstimatedTotalResults and ResultSet. */
    protected $results;

    // -----------------
    // Constructor
    // -----------------

    /**
     * @param string $domain  Domain to scrape; must contain the word "google".
     * @param array  $options Extra or overriding entries for the valid-option
     *                        map (query-string parameter => friendly name).
     *
     * @throws Exception When the domain fails validation.
     */
    public function __construct($domain, array $options = array())
    {
        // Valid search parameters :: format: key = search param, value = friendly name
        $validOptions = array(
            'hl' => 'interfaceLanguage', // Validate
            'btnG' => 'btnG',
            'num' => 'results',
            'oe' => 'outputEncoding', // Validate
            'ie' => 'inputEncoding', // Validate
            'qdr' => 'dateFilter',
            'lr' => 'language', // Validate
            'cr' => 'country', // Validate
            'safe' => 'safeFilter', // Validate
            'filter' => 'duplicateFilter', // Validate
            'start' => 'start'
        );

        $this->validOptions = array_merge($validOptions, $options);
        $this->validateDomain($domain);
        $this->results = $this->createResultsContainer();
    }

    // -----------------
    // Search Query
    // -----------------

    /**
     * Runs a search, fetching as many result pages as are needed.
     *
     * Paging stops early as soon as a page yields fewer results than a full
     * page, since that means there is nothing further to fetch.
     *
     * @param string $query            Search terms; must not be empty.
     * @param int    $requestedResults Total number of results wanted.
     * @param array  $options          Friendly-name options (see $validOptions).
     *
     * @return Google_Serp_ResultSet
     *
     * @throws Exception On an empty query, invalid options, a failed web
     *                   request or an unparsable result page.
     */
    public function search($query, $requestedResults = 100, $options = array())
    {
        // Minimum options required to get a search to run
        $defaultOptions = array(
            'interfaceLanguage' => 'en',
            'btnG' => 'Google Search'
        );
        $options = array_merge($defaultOptions, $options);

        if (empty($query)) {
            throw new Exception('Query string must not be empty!');
        }
        $this->validateOptions($options);

        $resultsPerPage = empty($options['results'])
            ? self::DEFAULT_RESULTS_PER_PAGE
            : (int) $options['results'];
        $startOffset = isset($options['start']) ? (int) $options['start'] : 0;
        $pagesRequired = $this->getPagesRequired($requestedResults, $options);

        // The container accumulates across calls, so measure progress
        // relative to what was already there when this search started.
        $tallyAtStart = (int) $this->resultsTally;
        $pagesReceived = 0;
        $expectedResultsTally = 0;

        while ($pagesReceived < $pagesRequired
            && ($this->resultsTally - $tallyAtStart) >= $expectedResultsTally
        ) {
            if ($pagesReceived > 0) {
                // Pause between consecutive requests only; no delay is needed
                // before the first page or after the last one.
                sleep(rand(5, 15));
                // 'start' is a zero-based result offset, advanced one page at a time.
                $options['start'] = $startOffset + ($resultsPerPage * $pagesReceived);
            }

            $queryString = $this->buildSearchString($query, $options);
            $resultPage = $this->sendSearch($queryString);
            $this->processResultsPage($resultPage);

            $pagesReceived++;
            $expectedResultsTally = $pagesReceived * $resultsPerPage;
        }

        // NOTE(review): path is relative, so it resolves against the include
        // path / working directory - confirm against the project layout.
        require_once('../libs/GoogleSerpScraper/google_serp_resultset.php');

        return new Google_Serp_ResultSet($this->results);
    }

    // -----------------
    // Get Number of Pages Required
    // -----------------

    /**
     * Works out how many result pages must be fetched.
     *
     * @param int   $requestedResults Total number of results wanted.
     * @param array $options          Friendly-name options; 'results' is the page size.
     *
     * @return int Page count, rounded up.
     */
    protected function getPagesRequired($requestedResults, array $options)
    {
        $resultsPerPage = empty($options['results'])
            ? self::DEFAULT_RESULTS_PER_PAGE
            : $options['results'];

        return (int) ceil($requestedResults / $resultsPerPage);
    }

    // -----------------
    // Validate Options
    // -----------------

    /**
     * Rejects unknown option names and out-of-range option values.
     *
     * @param array $options Friendly-name options to check.
     *
     * @throws Exception When an option key or value is invalid.
     */
    protected function validateOptions(array $options)
    {
        // Check there are no invalid options passed
        $unknownKeys = array_diff(array_keys($options), $this->validOptions);
        if ($unknownKeys) {
            throw new Exception('Invalid option keys were passed');
        }

        // Validate number of results requested per page
        if (isset($options['results'])
            && (!is_numeric($options['results']) || $options['results'] < 1 || $options['results'] > 100)
        ) {
            throw new Exception('Number of results per page must be between 1 - 100');
        }

        // Validate date option if set
        if (isset($options['dateFilter'])
            && preg_match('/^(d|m|y)[0-9]+$/', $options['dateFilter']) !== 1
        ) {
            throw new Exception('Date Filter Option must be expressed as either d, m or y, followed by a number');
        }
    }

    // -----------------
    // Validate Domain
    // -----------------

    /**
     * Stores the domain after a loose sanity check.
     *
     * @param string $domain Domain to search.
     *
     * @throws Exception When the domain is empty or does not mention "google".
     */
    protected function validateDomain($domain)
    {
        // Sloppy link check, apply external link object validation where available
        if (empty($domain) || stripos($domain, 'google') === false) {
            throw new Exception('A valid google domain to search must be supplied.');
        }
        $this->baseDomain = $domain;
    }

    // -----------------
    // Process Results Page
    // -----------------

    /**
     * Parses one page of result HTML and appends its results to the container.
     *
     * @param string $results Raw HTML of a result page.
     *
     * @throws Exception When the HTML is empty or cannot be loaded.
     */
    protected function processResultsPage($results)
    {
        if (!is_string($results) || trim($results) === '') {
            throw new Exception('Failed to load HTML from result page into DOM object');
        }

        $resultPage = new DOMDocument();

        // Real-world HTML is rarely valid; collect parser complaints quietly
        // instead of emitting warnings, then restore the caller's setting.
        $previousErrorMode = libxml_use_internal_errors(true);
        $loaded = $resultPage->loadHTML($results);
        libxml_clear_errors();
        libxml_use_internal_errors($previousErrorMode);

        if (!$loaded) {
            throw new Exception('Failed to load HTML from result page into DOM object');
        }

        $xpath = new DOMXPath($resultPage);

        // Set estimated total results; keep any earlier value when this page has none
        $estimatedTotal = $this->parseEstimatedTotalResults($xpath);
        if ($estimatedTotal !== '') {
            $this->results->getElementsByTagName('EstimatedTotalResults')->item(0)->nodeValue = $estimatedTotal;
        }

        // Isolate results
        $resultSet = $this->results->getElementsByTagName('ResultSet')->item(0);
        $resultNodes = $xpath->query('//div[@id="res"]//div[@class="g"]');

        // Parse out each result
        foreach ($resultNodes as $result) {
            $resultNode = $this->results->createElement('Result');
            $resultNode->appendChild($this->parseTitle($result));
            $resultNode->appendChild($this->parseLink($result));
            $resultNode->appendChild($this->parseSummaryText($result, $xpath));
            $resultNode->appendChild($this->parseCacheLink($result, $xpath));
            $resultSet->appendChild($resultNode);
        }

        // Count the individual results, not the single ResultSet wrapper
        $this->resultsTally = $this->results->getElementsByTagName('Result')->length;
    }

    // -----------------
    // Parse Estimated Total Results
    // -----------------

    /**
     * @param DOMXPath $xpath XPath bound to the result page.
     *
     * @return string Text of the estimated-total node, or '' when it is absent.
     */
    protected function parseEstimatedTotalResults(DOMXPath $xpath)
    {
        $matches = $xpath->query('//table[@class="t bt"]//font//b[3]');
        if ($matches === false || $matches->length === 0) {
            return '';
        }

        return $matches->item(0)->nodeValue;
    }

    // -----------------
    // Parse Title
    // -----------------

    /**
     * @param DOMNode $result A single result node.
     *
     * @return DOMElement <Title> element; empty when the result has no heading.
     */
    protected function parseTitle(DOMNode $result)
    {
        $heading = $this->findHeading($result);

        return $this->createTextElement('Title', $heading === null ? '' : $heading->nodeValue);
    }

    // -----------------
    // Parse Link
    // -----------------

    /**
     * @param DOMNode $result A single result node.
     *
     * @return DOMElement <URL> element; empty when the heading has no link.
     */
    protected function parseLink(DOMNode $result)
    {
        $url = '';
        $heading = $this->findHeading($result);
        if ($heading !== null) {
            $anchor = $heading->getElementsByTagName('a')->item(0);
            if ($anchor !== null) {
                $url = $anchor->getAttribute('href');
            }
        }

        return $this->createTextElement('URL', $url);
    }

    // -----------------
    // Parse Summary
    // -----------------

    /**
     * Extracts the summary text with any embedded link and span text removed.
     *
     * @param DOMNode  $result A single result node.
     * @param DOMXPath $xpath  XPath bound to the result page.
     *
     * @return DOMElement <Summary> element; empty when no summary is found.
     */
    protected function parseSummaryText(DOMNode $result, DOMXPath $xpath)
    {
        $summaryNodes = $xpath->query('.//font[@size = "-1"]', $result);
        if ($summaryNodes === false || $summaryNodes->length === 0) {
            return $this->createTextElement('Summary', '');
        }

        // Both branches are relative to this result; an absolute '//span'
        // would match every span in the whole page.
        $unwanted = array();
        $unwantedNodes = $xpath->query(
            './/font[@size = "-1"]//a | .//font[@size = "-1"]//span',
            $result
        );
        if ($unwantedNodes !== false) {
            foreach ($unwantedNodes as $node) {
                if ($node->nodeValue !== '') {
                    $unwanted[] = $node->nodeValue;
                }
            }
        }

        $summary = str_replace($unwanted, '', $summaryNodes->item(0)->nodeValue);

        return $this->createTextElement('Summary', $summary);
    }

    // -----------------
    // Parse Cache Link
    // -----------------

    /**
     * @param DOMNode  $result A single result node.
     * @param DOMXPath $xpath  XPath bound to the result page.
     *
     * @return DOMElement <CacheURL> element; empty when there is no "Cached" link.
     */
    protected function parseCacheLink(DOMNode $result, DOMXPath $xpath)
    {
        $cacheURL = '';
        $matches = $xpath->query('table//nobr/a[. = "Cached"]/@href', $result);
        if ($matches !== false && $matches->length > 0) {
            $cacheURL = $matches->item(0)->nodeValue;
        }

        return $this->createTextElement('CacheURL', $cacheURL);
    }

    // -----------------
    // Build Search String
    // -----------------

    /**
     * Translates friendly option names to query parameters and encodes them.
     *
     * @param string $query   Search terms.
     * @param array  $options Friendly-name options.
     *
     * @return string URL-encoded query string, without a leading '?'.
     */
    protected function buildSearchString($query, $options)
    {
        $params = array('q' => (string) $query);

        foreach ($options as $optionKey => $optionValue) {
            $translateKey = array_search($optionKey, $this->validOptions, true);
            if ($translateKey === false) {
                // Unknown name: skip it rather than emit a parameter keyed by `false`
                continue;
            }
            $params[$translateKey] = $optionValue;
        }

        // URL encodes and glues together $params; the separator is given
        // explicitly so the result does not depend on arg_separator.output.
        return http_build_query($params, '', '&');
    }

    // -----------------
    // Send Query
    // -----------------

    /**
     * Fetches a result page over HTTP.
     *
     * This is a default implementation built on PHP's HTTP stream wrapper;
     * override it in a subclass to plug in a different HTTP client.
     *
     * @param string $queryString Encoded query string from buildSearchString().
     *
     * @return string HTML received.
     *
     * @throws Exception When the request fails or returns an empty body.
     */
    protected function sendSearch($queryString)
    {
        $base = rtrim($this->baseDomain, '/');
        if (!preg_match('#^https?://#i', $base)) {
            // Without a scheme the value would be treated as a local file path
            $base = 'http://' . $base;
        }
        $url = $base . '/search?' . $queryString;

        $context = stream_context_create(array(
            'http' => array(
                'method' => 'GET',
                'timeout' => 30,
                'user_agent' => 'Mozilla/5.0 (compatible; Google_Serp_Scraper)'
            )
        ));

        // Failures are reported through the return value and the exception
        // below, so swallow the accompanying warning for this one call.
        set_error_handler(function () {
            return true;
        }, E_WARNING);
        $html = file_get_contents($url, false, $context);
        restore_error_handler();

        if ($html === false || $html === '') {
            throw new Exception('Web request to ' . $url . ' failed.');
        }

        return $html;
    }

    // -----------------
    // Create Results Container
    // -----------------

    /**
     * Builds the empty DOM container that results are collected into.
     *
     * @return DOMDocument Document holding an EstimatedTotalResults element
     *                     and a ResultSet element.
     */
    protected function createResultsContainer()
    {
        $dom = new DOMDocument('1.0', 'UTF-8');
        $dom->appendChild($dom->createElement('EstimatedTotalResults'));
        $dom->appendChild($dom->createElement('ResultSet'));

        return $dom;
    }

    // -----------------
    // Helpers
    // -----------------

    /**
     * Returns the first <h2> inside a result node.
     *
     * @param DOMNode $result A single result node.
     *
     * @return DOMElement|null Null when the node has no <h2>.
     */
    protected function findHeading(DOMNode $result)
    {
        if (!$result instanceof DOMElement) {
            return null;
        }

        return $result->getElementsByTagName('h2')->item(0);
    }

    /**
     * Creates an element in the results document holding plain text.
     *
     * The text goes in as a text node, so the DOM escapes markup characters
     * itself and no HTML-only entities end up in the XML.
     *
     * @param string $name Element name.
     * @param string $text Text content; '' yields an empty element.
     *
     * @return DOMElement
     */
    protected function createTextElement($name, $text)
    {
        $element = $this->results->createElement($name);
        $text = (string) $text;
        if ($text !== '') {
            $element->appendChild($this->results->createTextNode($text));
        }

        return $element;
    }
}