script to pull all thumbs from a page.

To The Live Thread

Thread: script to pull all thumbs from a page.

Back to category: C/++/#, Pascal etc.

nop_90

Uses libxml2dom: http://www.boddie.org.uk/python/libxml2dom.html
Basically, you supply it the URL of the gallery page, and the complete URLs of all the thumbs are returned as a list.

def get_thumbs(url):
    """Return the absolute URLs of all thumbnail images on a gallery page.

    The page at ``url`` is parsed with libxml2dom.  Every ``<a>`` element
    whose ``href`` path ends in one of the known media extensions
    (``.jpg``, ``.gif``, ``.avi``, ``.mpg``, ``.wmv``; compared
    case-insensitively) is treated as a gallery link, and the ``src`` of the
    first ``<img>`` nested inside that anchor is taken as its thumbnail.

    The names ``libxml2dom``, ``urlparse`` and ``os`` must already be
    imported at module level.

    Args:
        url: Address of the gallery page.  It is also the base against
            which relative thumbnail sources are resolved.

    Returns:
        A list of thumbnail URLs joined against ``url``, in the order the
        matching anchors are returned by the parser.

    Raises:
        ValueError: If no thumbnail could be found on the page.
    """
    # The second argument is kept as 1, exactly as posted; presumably it
    # switches libxml2dom to its HTML parser -- TODO confirm against the
    # libxml2dom documentation.
    tree = libxml2dom.parseURI(url, 1)

    thumb_exts = (".jpg", ".gif", ".avi", ".mpg", ".wmv")
    result = []
    for anchor in tree.getElementsByTagName("a"):
        href = anchor.getAttribute("href")
        if not href:
            # Anchors without a target (e.g. named anchors) cannot point at
            # a media file, and a missing value would break urlsplit().
            continue

        # Only the path component decides the file type; index 2 of the
        # urlsplit() result is the path, so query strings are ignored.
        path = urlparse.urlsplit(href)[2]
        ext = os.path.splitext(path.lower())[1]
        if ext not in thumb_exts:
            continue

        imgs = anchor.getElementsByTagName("img")
        if len(imgs) > 0:
            img_src = imgs[0].getAttribute("src")
            if img_src:
                # urljoin() turns relative sources into complete URLs.
                result.append(urlparse.urljoin(url, img_src))

    if len(result) <= 0:
        # A real exception class replaces the original string exception,
        # and the offending URL travels in the message instead of being
        # printed to stdout.
        raise ValueError("Error: no thumbnails found at %s" % (url,))
    return result

Thread Categories

		Best of The Cache Home
		Search The Cache