The Cache: Technology Expert's Forum
 
*
Welcome, Guest. Please login or register. September 18, 2019, 12:43:49 PM

Login with username, password and session length


Pages: [1]
  Print  
Author Topic: Class: MultiCurl  (Read 8558 times)
winterpark
n00b
*
Offline Offline

Posts: 2


View Profile
« on: July 29, 2007, 06:03:34 AM »

Here's my Curl class that uses the curl_multi interface as well as the regular one.

This has been refactored a couple times already and I'm almost happy with it now. No documentation available yet, so you will have to look through the code. If you use it, please share any hacks or suggestions.

basic usage:
Code:
<?php
// Callback for the request below. The class hands the callback $c->data[$i]
// by reference, so the parameter MUST be declared with & or the changes made
// here are lost.
function googleParse(&$data) {
    // The suggest endpoint answers with JSON of the form [query, [suggestions]].
    $words = json_decode($data['body']);
    if (!is_array($words) || !isset($words[0], $words[1]) || !is_array($words[1])) {
        // Unexpected or undecodable body: expose an empty list instead of failing.
        $data['words'] = array();
        return;
    }
    // Append the original query to the list of suggestions.
    $words[1][] = $words[0];
    $data['words'] = $words[1];
}

$c = new Curl;
$url = 'http://google.com/complete/search?output=firefox&client=firefox&qu=' . urlencode('sexy time');
$curlOptions = array(
    CURLOPT_TIMEOUT => 10,
    CURLOPT_REFERER => 'http://google.com/'
);

// The key used here is the key of the result: if you use $requests['googSug'],
// then you need to read $c->data['googSug'] afterwards.
$requests = array();
$requests[] = $c->addRequest($url, $curlOptions, 'googleParse');

$c->perform($requests);

print_r($c->data[0]['words']);

?>



Code:
<?php
/*
****
**** COPYRIGHT 2007 webuildspam.com
**** All rights reserved.
**** You may not redistribute this file commercially or free of charge.
****
*/
/**
 * Thin wrapper around PHP's curl and curl_multi interfaces.
 *
 * Requests are described as arrays (see addRequest()), collected into a list
 * and executed with perform(). Results are stored in $this->data under the
 * same key the request had in the list; failures are stored in $this->errors
 * under that key.
 */
class Curl {

    /* VERY IMPORTANT
    ** MULTI MODE: true,false
    ** If you are having problems with curl_multi, change to false and it will work with
    ** non-multi curl. You will still be able to use $this->addRequest() and $this->perform()
    ** to execute multiple requests (however they won't be executed concurrently).
    ** Sidenote: Also useful when you need to run requests slower.
    */
    public $multiMode = true;

    // Max simultaneous requests, if there are more in the request pool it runs chunk()
    public $maxParallelRequests = 50;

    // Delay between requests in seconds (fractions allowed), only works with $multiMode set to false.
    public $delayBetweenSingle = 0;

    /*
    ** User-Agents
    */
    public $useragentsFilePath = ''; // file (one per line)
    public $useragentsRotate = false; // Auto rotation
    public $useragents = array(); // You can add directly to this array

    /*
    ** Proxies
    */
    public $proxiesFilePath = ''; // file (one per line: IP, PORT, USER, PASS separated by $proxiesDelimiter)
    public $proxiesDelimiter = ','; // delimiter for the file
    public $proxiesRotate = false; // Auto rotation
    //public $proxiesMinSpeed = 60; // Set minimum speed for the proxies in seconds, if response takes more they are removed
    //public $proxiesRetries = 0; // Retry X times with a new proxy on error
    public $proxies = array(); // You can add directly to this array: array('ip'=>, 'port'=>, 'user'=>, 'pass'=>)

    /*
    ** Interface
    */
    public $interfacesAutoDiscovery = false; // tries to autodiscover the available interfaces using `ifconfig`
    public $interfacesFilePath = ''; // file (one per line, 0.0.0.0)
    public $interfacesRotate = false; // Auto rotation
    public $interfaces = array(); // You can add directly to this array

    /*
    ** Cookies
    */
    protected $handleCookies = true; // Collect Set-Cookie headers into $this->data[$i]['cookies']

    /*
    ** Debug
    */
    public $errors = array(); // Curl errors: $errors[$i] = array('num' => errno, 'msg' => message)

    /*
    ** Request pool
    ** request: 'url', 'options', 'callback', 'callbackArgs', 'metaData'
    **
    ** 'url': the URL to fetch
    ** 'options': CURL options
    ** 'callback': func_name or array( obj, 'method_name')
    ** 'callbackArgs': array of arguments for the callback
    ** 'metaData': array used as the initial content of the response data array
    **
    ** the callback receives a response data array (by reference, $this->data[$i]) as the first arg, then anything you pass it with 'callbackArgs'
    ** ex. function stripHtml(&$data) { $data['plainText'] = strip_tags($data['body']); }
    ** -- the above will add ['plainText'] to $this->data;
    */
    public $requests = array();

    /*
    ** Response data associative arrays
    ** response: 'body', 'head', 'headSize', 'cookies', 'httpCode', 'effectiveUrl'
    */
    public $data = array();

    /*
    ** Internal variables
    */

    // DEFAULT CURL OPTIONS
    // Applied to every handle BEFORE the per-request options, so options given
    // to addRequest() take precedence over these.
    // DO NOT CHANGE 'CURLOPT_RETURNTRANSFER' and 'CURLOPT_HEADER' UNLESS YOU KNOW WHAT YOU ARE DOING
    // (the body is extracted by skipping the header bytes counted in headerCallback()).
    public $defaultOptions = array(
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_HEADER => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_MAXREDIRS => 20,
    );

    protected $ch; // Single curl handle
    protected $mh; // Multi curl handle
    protected $chPool = array(); // Pool of handles for the multihandle
    protected $activeConnection;
    protected $mrc;

    protected $chunked = false;
    protected $chunks = array();

    protected $redirects = array();

    protected $tempHeaders = array();

    /*
    ** Constructor and destructor
    */

    /**
     * Lifts the script time limit, installs the header callback and loads the
     * rotation lists (see loadRotationSources()).
     */
    public function __construct() {
        set_time_limit(0);

        // Default callbacks (IMPORTANT): headers are counted and parsed by headerCallback()
        $this->defaultOptions[CURLOPT_HEADERFUNCTION] = array($this, 'headerCallback');

        $this->loadRotationSources();
    }

    public function __destruct() {

    }

    /**
     * Loads proxies, user agents and interfaces from their configured sources.
     *
     * The constructor calls this, but at that point the public properties still
     * hold their defaults unless a subclass changed them. Call it again after
     * setting the *Rotate / *FilePath properties on the instance. File entries
     * are appended to the existing arrays; auto-discovered interfaces replace
     * $this->interfaces.
     */
    public function loadRotationSources() {
        if ($this->proxiesRotate) {
            foreach ($this->readListFile($this->proxiesFilePath) as $line) {
                // Limit to 4 fields so a password may contain the delimiter.
                $fields = array_map('trim', explode($this->proxiesDelimiter, $line, 4));
                $fields = array_pad($fields, 4, null);

                $this->proxies[] = array(
                    'ip' => $fields[0],
                    'port' => $fields[1],
                    'user' => $fields[2],
                    'pass' => $fields[3],
                );
            }
        }

        if ($this->useragentsRotate) {
            foreach ($this->readListFile($this->useragentsFilePath) as $line) {
                $this->useragents[] = $line;
            }
        }

        if ($this->interfacesRotate && $this->interfacesAutoDiscovery) {
            $this->interfaces = $this->discoverInterfaces();
        } elseif ($this->interfacesRotate) {
            foreach ($this->readListFile($this->interfacesFilePath) as $line) {
                $this->interfaces[] = $line;
            }
        }
    }

    /*
    ** Single HTTP methods
    */

    /**
     * Performs a single GET request.
     *
     * @param string $url
     * @param array|string $curlOpts extra CURL options ('' for none)
     * @param callable|string $callback optional callback, see $requests
     * @param array|string $callbackArgs extra callback arguments
     * @return array|false response data array of this request, false if none was recorded
     */
    public function get($url, $curlOpts = '', $callback = '', $callbackArgs = '') {
        // The '' default must become an array before it is indexed.
        $curlOpts = is_array($curlOpts) ? $curlOpts : array();
        $curlOpts[CURLOPT_HTTPGET] = true;

        $a = array($this->addRequest($url, $curlOpts, $callback, $callbackArgs));
        $this->perform($a);

        // The request was queued under key 0; do not use end($this->data),
        // which could return a stale entry from an earlier, larger run.
        return isset($this->data[0]) ? $this->data[0] : false;
    }

    /**
     * Performs one POST request, or one per variable set.
     *
     * @param string $url
     * @param array|string $vars either one set of fields (name => value), a raw
     *                           body string, or a list of such sets (one request each)
     * @param array|string $curlOpts extra CURL options ('' for none)
     * @param callable|string $callback optional callback, see $requests
     * @param array|string $callbackArgs extra callback arguments
     * @return array|false response data array of the last request, false if none was recorded
     */
    public function post($url, $vars, $curlOpts = '', $callback = '', $callbackArgs = '') {
        $curlOpts = is_array($curlOpts) ? $curlOpts : array();
        $curlOpts[CURLOPT_POST] = true;

        // A list of sets is recognised by its first element being an array.
        $isList = is_array($vars) && isset($vars[0]) && is_array($vars[0]);
        $sets = $isList ? array_values($vars) : array($vars);

        $a = array();
        foreach ($sets as $set) {
            $fields = is_string($set) ? $set : $this->prepareVars($set);
            $curlOpts[CURLOPT_POSTFIELDS] = ($fields === false) ? '' : $fields;
            $a[] = $this->addRequest($url, $curlOpts, $callback, $callbackArgs);
        }

        $this->perform($a);

        $last = count($a) - 1;
        return isset($this->data[$last]) ? $this->data[$last] : false;
    }

    /*
    ** Multiple HTTP methods
    */

    /**
     * Builds a request description; it does not queue or send anything.
     * Collect the returned arrays in a list and pass that list to perform().
     *
     * @param string $url
     * @param array|string $curlOptions CURL options for this request ('' for none)
     * @param callable|string $callback stored only when it is callable
     * @param array|string $callbackArgs extra callback arguments
     * @param array|string $metaData array copied into the response data array
     * @return array request description
     */
    public function addRequest($url, $curlOptions = '', $callback = '', $callbackArgs = '', $metaData = '') {
        $request = array(
            'url' => $url,
            'options' => is_array($curlOptions) ? $curlOptions : array(),
        );

        if (is_callable($callback)) {
            $request['callback'] = $callback;
            $request['callbackArgs'] = $callbackArgs;
        }

        if (is_array($metaData)) {
            $request['metaData'] = $metaData;
        }

        return $request;
    }

    /**
     * Builds one request description per URL.
     *
     * @param array|string $urls list of URLs (a single URL string is accepted too)
     * @param array|string $curlOptions one option set shared by all URLs, or a list of
     *                                  option sets keyed like $urls (missing keys fall
     *                                  back to the set at index 0)
     * @param callable|string $callback stored only when it is callable
     * @param array|string $callbackArgs extra callback arguments
     * @return array list of request descriptions, ready for perform()
     */
    public function addArray($urls, $curlOptions = '', $callback = '', $callbackArgs = '') {
        if (!is_array($curlOptions)) {
            $curlOptions = array();
        }
        // A single option set is wrapped so it can be addressed as index 0.
        if (!isset($curlOptions[0]) || !is_array($curlOptions[0])) {
            $curlOptions = array($curlOptions);
        }

        $requests = array();
        foreach ((array) $urls as $i => $url) {
            $hasOwn = isset($curlOptions[$i]) && is_array($curlOptions[$i]);
            $r = array(
                'url' => $url,
                'options' => $hasOwn ? $curlOptions[$i] : $curlOptions[0],
            );
            if (is_callable($callback)) {
                $r['callback'] = $callback;
                $r['callbackArgs'] = $callbackArgs;
            }
            $requests[] = $r;
        }

        return $requests;
    }

    /**
     * Executes a list of requests and fills $this->data / $this->errors.
     *
     * @param array $requests list of request descriptions; a single description
     *                        (an array with a 'url' key) is accepted and wrapped
     * @return void
     */
    public function perform($requests) {
        // Accept the result of a single addRequest() call passed directly.
        if (is_array($requests) && isset($requests['url'])) {
            $requests = array($requests);
        }
        if (!is_array($requests) || count($requests) === 0) {
            $this->requests = array();
            return;
        }

        $this->requests = $requests;

        // With only one request drop down to single mode for this run only;
        // the caller's setting is restored below.
        $configuredMode = $this->multiMode;
        if (count($requests) === 1) {
            $this->multiMode = false;
        }

        // Chunk requests if they are more than allowed, not already chunked and multi mode is on
        if ($this->multiMode && !$this->chunked && $this->maxParallelRequests < count($this->requests)) {
            $this->chunk($this->maxParallelRequests);
        }

        // chunk() preserves the request keys, so results keep their original index.
        $batches = $this->chunked ? $this->chunks : array($this->requests);
        foreach ($batches as $batch) {
            $this->chPool = array();
            // initialise the single handles, stored in $this->chPool
            $this->makeHandles($batch);
            // perform requests in the pool
            $this->performMulti();
            // release the handles of this batch
            $this->performMultiCleanup();
        }

        // Reset per-run state so the next perform() starts clean.
        $this->requests = array();
        $this->chunked = false;
        $this->chunks = array();
        $this->multiMode = $configuredMode;
    }

    /*
    ** General methods
    */

    /**
     * Splits $this->requests into chunks of at most $chunkSize requests
     * (keys preserved) when there are more requests than that.
     *
     * @param int $chunkSize
     * @return void
     */
    public function chunk($chunkSize) {
        if ($chunkSize > 0 && count($this->requests) > $chunkSize) {
            $this->chunked = true;
            $this->chunks = array_chunk($this->requests, $chunkSize, true);
        }
    }

    /**
     * URL-encodes a name => value array into a query/body string.
     *
     * @param array $vars
     * @return string|false encoded string, false for an empty or non-array argument
     */
    public function prepareVars($vars) {
        if (!is_array($vars) || empty($vars)) {
            return false;
        }

        $pairs = array();
        foreach ($vars as $k => $v) {
            $pairs[] = urlencode((string) $k) . '=' . urlencode((string) $v);
        }

        return implode('&', $pairs);
    }

    /**
     * Lists the public IPv4 addresses reported by `ifconfig -a`.
     * Private, loopback and link-local addresses are left out.
     *
     * @return array list of IPv4 address strings (empty if ifconfig gave no output)
     */
    public function discoverInterfaces() {
        $output = shell_exec('ifconfig -a');
        if (!is_string($output) || $output === '') {
            return array();
        }

        // Matches both "inet addr:1.2.3.4" (older net-tools) and "inet 1.2.3.4".
        if (!preg_match_all('@inet\s+(?:addr:\s*)?([0-9]{1,3}(?:\.[0-9]{1,3}){3})@', $output, $m)) {
            return array();
        }

        $public = array();
        $flags = FILTER_FLAG_IPV4 | FILTER_FLAG_NO_PRIV_RANGE | FILTER_FLAG_NO_RES_RANGE;
        foreach ($m[1] as $ip) {
            if (filter_var($ip, FILTER_VALIDATE_IP, $flags) !== false && !in_array($ip, $public, true)) {
                $public[] = $ip;
            }
        }

        return $public;
    }

    /******
    ** Internal methods
    ******/

    /**
     * Reads a list file and returns its non-empty, trimmed lines.
     *
     * @param string $path
     * @return array empty when the path is empty or unreadable
     */
    protected function readListFile($path) {
        if (!$path || !is_readable($path)) {
            return array();
        }

        $lines = file($path, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
        if ($lines === false) {
            return array();
        }

        $clean = array();
        foreach ($lines as $line) {
            $line = trim($line);
            if ($line !== '') {
                $clean[] = $line;
            }
        }

        return $clean;
    }

    /**
     * Applies the rotated properties (user agent, proxy, interface) and then
     * $this->defaultOptions to a handle.
     */
    protected function setDefaultOptions(&$ch) {
        $options = array();

        // Rotate properties
        if ($this->useragentsRotate && !empty($this->useragents)) {
            $options[CURLOPT_USERAGENT] = $this->rotateProperty('useragents');
        }

        if ($this->proxiesRotate && !empty($this->proxies)) {
            $proxy = $this->rotateProperty('proxies');

            $options[CURLOPT_PROXY] = $proxy['ip'];
            if (isset($proxy['port']) && $proxy['port'] !== '') {
                $options[CURLOPT_PROXYPORT] = $proxy['port'];
            }
            if (isset($proxy['user']) && $proxy['user'] !== '') {
                // Parentheses matter: '.' binds tighter than '?:'.
                $options[CURLOPT_PROXYUSERPWD] = $proxy['user'] . ':' . (isset($proxy['pass']) ? $proxy['pass'] : '');
            }
        }

        if ($this->interfacesRotate && !empty($this->interfaces)) {
            $options[CURLOPT_INTERFACE] = $this->rotateProperty('interfaces');
        }

        $this->setOptions($ch, $options);
        $this->setOptions($ch, $this->defaultOptions);
    }

    /**
     * Applies a non-empty option array to a handle; anything else is ignored.
     */
    protected function setOptions(&$ch, $curlOptions) {
        if (is_array($curlOptions) && !empty($curlOptions)) {
            curl_setopt_array($ch, $curlOptions);
        }
    }

    /**
     * Creates one curl handle per request in $this->chPool (same keys) and
     * resets the response data and error slot for each key.
     */
    protected function makeHandles($requests) {
        foreach ($requests as $i => $request) {
            // Start from the metaData (if any) and a zero header counter, so a
            // previous run under the same key cannot shift the body offset.
            $hasMeta = isset($request['metaData']) && is_array($request['metaData']);
            $this->data[$i] = $hasMeta ? $request['metaData'] : array();
            $this->data[$i]['headSize'] = 0;
            unset($this->errors[$i], $this->redirects[$i]);

            $handle = (isset($request['url']) && is_string($request['url']))
                ? curl_init($request['url'])
                : false;
            if ($handle === false) {
                // Keep failed handles out of the pool; record why the request is missing.
                $this->errors[$i] = array('num' => 0, 'msg' => 'could not initialise a curl handle for this request');
                continue;
            }

            $this->chPool[$i] = $handle;

            // Apply default options to all requests
            $this->setDefaultOptions($this->chPool[$i]);

            // Apply individual options if any (these override the defaults)
            if (isset($request['options'])) {
                $this->setOptions($this->chPool[$i], $request['options']);
            }
        }
    }

    protected function addHandle(&$ch) {
        curl_multi_add_handle($this->mh, $ch);
    }

    protected function removeHandle(&$ch) {
        curl_multi_remove_handle($this->mh, $ch);
    }

    /**
     * Closes an easy handle where that still has an effect.
     */
    protected function closeHandle($ch) {
        // Since PHP 8.0 curl_close() does nothing; handles are freed with the object.
        if (PHP_VERSION_ID < 80000) {
            curl_close($ch);
        }
    }

    /**
     * Runs every handle in $this->chPool, sequentially when $this->multiMode is
     * false and concurrently through curl_multi otherwise. Handles are left
     * open; performMultiCleanup() releases them.
     */
    protected function performMulti() {
        // Mode is single curl
        if ($this->multiMode === false) {
            foreach (array_keys($this->chPool) as $i) {
                if ($this->delayBetweenSingle > 0) {
                    usleep((int) round($this->delayBetweenSingle * 1000000));
                }

                $response = curl_exec($this->chPool[$i]);
                $errno = curl_errno($this->chPool[$i]);

                if ($response !== false && $errno === 0) {
                    $this->parseResponse($i, $response);
                } else {
                    $this->errors[$i] = array('num' => $errno, 'msg' => curl_error($this->chPool[$i]));
                }
            }
            return;
        }

        // Mode is multi curl: initialise multihandle
        $this->mh = curl_multi_init();
        foreach (array_keys($this->chPool) as $i) {
            $this->addHandle($this->chPool[$i]);
        }

        // Drive the transfers until none is active or the multi handle reports an error
        $this->activeConnection = 0;
        do {
            do {
                $this->mrc = curl_multi_exec($this->mh, $this->activeConnection);
            } while ($this->mrc === CURLM_CALL_MULTI_PERFORM);

            // Wait for network activity; -1 means select failed, so pause
            // briefly instead of spinning.
            if ($this->activeConnection && $this->mrc === CURLM_OK
                && curl_multi_select($this->mh, 1.0) === -1) {
                usleep(100000);
            }
        } while ($this->activeConnection && $this->mrc === CURLM_OK);

        // Per-transfer result codes are reported through the multi handle's message queue
        $results = array();
        while (($info = curl_multi_info_read($this->mh)) !== false) {
            $i = array_search($info['handle'], $this->chPool, true);
            if ($i !== false) {
                $results[$i] = $info['result'];
            }
        }

        foreach (array_keys($this->chPool) as $i) {
            $errno = isset($results[$i]) ? $results[$i] : curl_errno($this->chPool[$i]);

            if ($errno === CURLE_OK) {
                // Retrieve data
                $this->parseResponse($i, curl_multi_getcontent($this->chPool[$i]));
            } else {
                $msg = curl_error($this->chPool[$i]);
                if ($msg === '' && function_exists('curl_strerror')) {
                    $msg = (string) curl_strerror($errno);
                }
                $this->errors[$i] = array('num' => $errno, 'msg' => $msg);
            }
        }
    }

    /**
     * Releases every handle in the pool (and the multi handle, if one is open)
     * and empties the pool.
     */
    protected function performMultiCleanup() {
        foreach (array_keys($this->chPool) as $i) {
            if ($this->mh !== null) {
                $this->removeHandle($this->chPool[$i]);
            }
            $this->closeHandle($this->chPool[$i]);
        }

        if ($this->mh !== null) {
            curl_multi_close($this->mh);
            $this->mh = null;
        }

        $this->chPool = array();
    }

    /**
     * CURLOPT_HEADERFUNCTION callback: counts header bytes for the handle and
     * passes each header line to headerFilter().
     *
     * @return int length of $head to continue, -1 to stop the request while reading the header
     */
    protected function headerCallback($ch, $head) {
        $len = strlen($head);

        // finds the key for our $ch; the third parameter 'true' makes it compare strictly
        $i = array_search($ch, $this->chPool, true);
        if ($i === false) {
            // Not one of our pooled handles: let the transfer continue untouched.
            return $len;
        }

        if (!isset($this->data[$i]['headSize'])) {
            $this->data[$i]['headSize'] = 0;
        }
        $this->data[$i]['headSize'] += $len;

        return $this->headerFilter($i, $head) ? $len : -1;
    }

    /**
     * Stores a header line in $this->data[$i]['head'] and records Set-Cookie
     * and Location headers.
     *
     * @return bool false to abort the request, true to continue
     */
    protected function headerFilter($i, $head) {
        // Set $abort to true to stop the request while reading the header
        $abort = false;

        $head = trim($head);

        if ($head !== '') {
            $this->data[$i]['head'][] = $head;

            // Status lines such as "HTTP/1.1 200 OK" have no colon and are only stored above.
            $p = strpos($head, ':');
            if ($p !== false) {
                $type = strtolower(trim(substr($head, 0, $p)));
                $content = trim(substr($head, $p + 1));

                switch ($type) {
                    // Cookies
                    case 'set-cookie':
                        if ($this->handleCookies) {
                            $this->data[$i]['cookies'][] = $this->parseCookie($content);
                        }
                        break;

                    // Redirects
                    case 'location':
                        $this->redirects[$i][] = $content;
                        break;
                }
            }
        }

        return !$abort;
    }

    //
    // A CURLOPT_WRITEFUNCTION callback would receive chunks of the whole http
    // response (including headers) and must return the length of each chunk.
    // None is installed; the response is read through CURLOPT_RETURNTRANSFER.
    //

    /**
     * Splits the raw response into the body, records the HTTP code and final
     * URL, and runs the request's callback.
     *
     * @param int|string $i key of the request / handle
     * @param string $resp raw response (headers followed by body)
     */
    protected function parseResponse($i, $resp) {
        $resp = (string) $resp;
        $headSize = isset($this->data[$i]['headSize']) ? (int) $this->data[$i]['headSize'] : 0;

        // The body starts right after the counted header bytes. The body is
        // trimmed, as it always has been, so surrounding whitespace is lost.
        $this->data[$i]['body'] = trim((string) substr($resp, $headSize));

        if (isset($this->chPool[$i])) {
            $this->data[$i]['httpCode'] = curl_getinfo($this->chPool[$i], CURLINFO_HTTP_CODE);
            $this->data[$i]['effectiveUrl'] = curl_getinfo($this->chPool[$i], CURLINFO_EFFECTIVE_URL);
        }

        if (isset($this->requests[$i]['callback'])) {
            // The reference lives inside the argument array, so a callback
            // declared as f(&$data) modifies $this->data[$i] directly.
            $args = array(&$this->data[$i]);
            if (isset($this->requests[$i]['callbackArgs']) && is_array($this->requests[$i]['callbackArgs'])) {
                foreach ($this->requests[$i]['callbackArgs'] as $extra) {
                    $args[] = $extra;
                }
            }
            call_user_func_array($this->requests[$i]['callback'], $args);
        }
    }

    /**
     * Extracts "name=value" from a Set-Cookie header value.
     *
     * @param string $str header value, e.g. "sid=abc; Path=/; HttpOnly"
     * @return string|false "name=value", false when the cookie has no name
     */
    protected function parseCookie($str) {
        // Only the first segment is the cookie itself; the rest are attributes
        // (Path, Expires, ...) which must not overwrite the name.
        $segments = explode(';', $str);
        // Limit 2: the value itself may contain '='.
        $pair = explode('=', $segments[0], 2);

        $name = trim($pair[0]);
        $value = isset($pair[1]) ? trim($pair[1]) : '';

        if ($name === '') {
            return false;
        }

        return $name . '=' . $value;
    }

    protected function rxCallback() {}

    /**
     * Returns a random element of the named array property.
     *
     * @param string $property property name, e.g. 'proxies'
     * @return mixed the element, null when the property is not a non-empty array
     */
    protected function rotateProperty($property) {
        // Braces make the lookup unambiguous across PHP versions.
        $pool = $this->{$property};
        if (!is_array($pool) || empty($pool)) {
            return null;
        }

        return $pool[array_rand($pool)];
    }
}


?>
« Last Edit: July 29, 2007, 06:14:23 AM by winterpark » Logged

No links in signatures please
winterpark
n00b
*
Offline Offline

Posts: 2


View Profile
« Reply #1 on: July 29, 2007, 06:11:24 AM »

Apparently you should move this to the PHP code section.
Logged

No links in signatures please
meme
n00b
*
Offline Offline

Posts: 6


View Profile
« Reply #2 on: November 30, 2007, 09:45:10 AM »

Perkiset, delete this post and I'll make a new one with the new stuff. (I'm winterpark, just lost pswd/forgot username so I made a new one)
Logged

No links in signatures please
nutballs
Administrator
Lifer
*****
Offline Offline

Posts: 5627


Back in my day we had 9 planets


View Profile
« Reply #3 on: November 30, 2007, 10:41:53 AM »

why got a different IP then?
Logged

I could eat a bowl of Alphabet Soup and shit a better argument than that.
1414mark1414
n00b
*
Offline Offline

Posts: 4


View Profile
« Reply #4 on: January 04, 2012, 09:39:47 AM »

Hello,

I'm sorry for bumping an old thread but could anyone tell me if this class still works and show me an example because the example given in the first post
doesn't seem to be working

I hope this class still works because it seems to have everything I need; delay, proxies basically this class seems godly

thanks in advance,

1414mark1414
Logged
perkiset
Olde World Hacker
Administrator
Lifer
*****
Offline Offline

Posts: 10096



View Profile
« Reply #5 on: January 04, 2012, 10:09:22 AM »

Welcome to the cache, 1414.

I've never implemented this class myself, so I don't know what its behavior should have been. What exactly is happening though? Perhaps we can walk through it together.
Logged

It is now believed, that after having lived in one compound with 3 wives and never leaving the house for 5 years, Bin Laden called the U.S. Navy Seals himself.
1414mark1414
n00b
*
Offline Offline

Posts: 4


View Profile
« Reply #6 on: January 04, 2012, 10:52:54 AM »

Well I'm trying to curl a couple of links which are in json while using proxies

I tried the example that winterpark  posted but it doesn't seem to work

and personally I have no idea how to execute a multi curl with this class

Code:
// Client that will run the requests.
$c = new Curl();

// Four requests against the same address (keys 0..3).
$url = array_fill(0, 4, 'http://google.com');

// Options shared by every request: 10 second timeout and a fixed referer.
$curlOptions = array();
$curlOptions[CURLOPT_TIMEOUT] = 10;
$curlOptions[CURLOPT_REFERER] = 'http://google.com/';


I have no idea what to do next :S
btw I'm not a php noob but I am a curl noob Tongue

Also thanks for the fast reply I appreciate it!
Logged
perkiset
Olde World Hacker
Administrator
Lifer
*****
Offline Offline

Posts: 10096



View Profile
« Reply #7 on: January 04, 2012, 11:07:19 AM »

No worries man, it's great having a new member that's not a 'bot  ROFLMAO

So first off if that's really similar to your array, then I have no doubt that some of the trouble is Google stopping you. I'm pretty certain that they are not keen about multiple hits from the same address in that way ... especially through a proxy. If you're scraping Google results you're going to do some things way more tricky than that to do it.

Nutballs, BTW, is considerably better at scraping results anymore than I am, if that's your goal. If yes, post and I'll make sure he weighs in.

Have you tried something considerably simpler, like hitting 2 different websites from unafraid hosts? I dunno, just google something in truck parts and another in plumbing or something - 2 sites that could give a shit, and then try to hit them both. Also, start without a proxy. Proxies, as you can read from here and the Syndk8, are notorious problem causers when doing things like this.

I'd also like to know more about your intentions. MultiCurl is a very specific way to tackle a very specific challenge. If we know more about your goal, we may be able to suggest alternative methods.
Logged

It is now believed, that after having lived in one compound with 3 wives and never leaving the house for 5 years, Bin Laden called the U.S. Navy Seals himself.
1414mark1414
n00b
*
Offline Offline

Posts: 4


View Profile
« Reply #8 on: January 04, 2012, 11:29:55 AM »

haha I am actually testing without proxies also I'm testing on a single curl and not multi

Maybe I'm not getting the syntax right ?? :S or the order of operation I'm totally lost

here's the code I've just tested
Code:
<?php

$c = new Curl;
$url = 'http://break.com';
$curlOptions = array(
    CURLOPT_TIMEOUT => 10,
    CURLOPT_REFERER => 'http://break.com'
);

// addRequest() only builds a request description; perform() expects a LIST of
// those descriptions, so the request has to be put into an array first.
$requests = array();
$requests[] = $c->addRequest($url, $curlOptions);
$c->perform($requests);

// The result of the request stored under key 0 is in $c->data[0].
print_r($c->data);

?>


"this is just me testing, break has nothing to do with my project/goal"
basically what I'm trying to do is to multi curl a bunch of links which are in json and contain different numbers, then afterwards place them in database for future use

there are three possible problems here: syntax, order of operation or the class contains errors and doesn't work anymore
Logged
perkiset
Olde World Hacker
Administrator
Lifer
*****
Offline Offline

Posts: 10096



View Profile
« Reply #9 on: January 04, 2012, 11:41:03 AM »

In this case then, must you cURL? Have you looked at my webRequest class?

http://www.perkiset.org/forum/php/perks_new_webrequest_class-t616.0.html

since it's all PHP and you're not a n00b, that may be more immediately understandable. Note that the proxy stuff was never really tested at the end, so it may not *perfectly* suit your needs, but it may be more malleable than cURL, which is a pretty big hammer. And honestly, if it's just a simple pull, then using file_get_contents('http:// ... ') is another completely valid way of doing it.

My spider uses webRequest. Here's the interesting thing: Windows programmers are pushed towards moving things into a single instance (I dunno if this fits you). Even Linux people are often moved toward threading from a single instance rather than distributed tasking. Essentially my spider puts URLs that I need to scrape into a database, then spiderlets "check out" one of the URLs and execute in a standalone process. This is more memory heavy than true threading, but it takes advantage of the system multitasking very efficiently.

You can also fork a process if you desire - it's another way of multitasking and might be additionally more understandable.
http://www.perkiset.org/forum/php/forked_php_daemon_example-t474.0.html

Thoughts?
Logged

It is now believed, that after having lived in one compound with 3 wives and never leaving the house for 5 years, Bin Laden called the U.S. Navy Seals himself.
1414mark1414
n00b
*
Offline Offline

Posts: 4


View Profile
« Reply #10 on: January 04, 2012, 12:10:14 PM »

The reason I do need to use proxies is because after a number of requests to the server that I'm screen scraping  it starts to send empty requests
and also since I was cheap Cry I got proxies that don't allow sockets

Anyhow I'm going to take a break and then imma check the other method you proposed

thanks again!
Logged
perkiset
Olde World Hacker
Administrator
Lifer
*****
Offline Offline

Posts: 10096



View Profile
« Reply #11 on: January 04, 2012, 12:18:08 PM »

You may consider getting several cheap hosts rather than just one, then put your spider code on all of them being served from a central DB (I did this for many years, works friction' great). Then you'd have multiple vectors without the need of proxies. Although depending on how cheap you are this may still be too expensive ...
Logged

It is now believed, that after having lived in one compound with 3 wives and never leaving the house for 5 years, Bin Laden called the U.S. Navy Seals himself.
Pages: [1]
  Print  
 
Jump to:  

Perkiset's Place Home   Best of The Cache   phpMyIDE: MySQL Stored Procedures, Functions & Triggers
Politics @ Perkiset's   Pinkhat's Perspective   
cache
mart
coder
programmers
ajax
php
javascript
Powered by MySQL Powered by PHP Powered by SMF 1.1.2 | SMF © 2006-2007, Simple Machines LLC
Seo4Smf v0.2 © Webmaster's Talks


Valid XHTML 1.0! Valid CSS!