Hi all, i've just create some code to download some files simultaneously and code itself based on python books,
import os, string, htmllib, urllib, formatter, urlparse, re
Dir = "C:\\test\\"
Html_ext = { "":1, "HTM":1, "PHTML":1, "SHTML":1, "HTML":1, "ASP":1, "PHP":1, "PHP3":1 }
URL_string = raw_input("Enter page: ")
URLre = re.compile(re.escape(URL_string), re.IGNORECASE)
VisitedURL = {}
TargetURLList = [URL_string]
def AddURL(NewURL):
if (VisitedURL.has_key(NewURL)):
return
if (not URLre.search(NewURL)):
return
TargetURLList.append(NewURL)
def GetExtension(FileString):
DotChunks = string.split(FileString, ".")
if len(DotChunks) == 1:
return ""
LastBlock = DotChunks[-1]
if string.find(LastBlock, "/") != 1:
return ""
if string.find(LastBlock, "\\") != -1:
return ""
return string.upper(LastBlock)
class HTMLBot(htmllib.HTMLParser):
def newPage(self, BaseURL):
self.BaseURL = BaseURL
def __init__(self):
htmllib.HTMLParser.__init__(self, formatter.NullFormatter())
def body(self, args):
for ValTuple in args:
if string.upper(ValTuple[0]) == "BACKGROUND":
ImageURL = urlparse.urljoin(self.BaseURL, ValTuple[1])
AddURL(ImageURL)
def embed(self, args):
for ValTuple in args:
if string.upper(ValTuple[0]) == "SRC":
self.HandleAnchor(ValTuple[1])
def area(self, args):
for ValTuple in args:
if string.upper(ValTuple[0]) == "HREF":
self.HandleAnchor(ValTuple[1])
def handle_image(self, source, alt, ismap, align, width, height):
ImageURL = urlparse.urljoin(self.BaseURL, source)
AddURL(ImageURL)
def anchor_bgn(self, TempURL, name, type):
if TempURL[0:7].upper() == "MAILTO: ":
return
NewURL = urlparse.urljoin(self.BaseURL, TempURL)
AddURL(NewURL)
def frame(self, args):
for ValTuple in args:
if string.upper(ValTuple[0]) == "SRC":
self.anchor_bgn(ValTuple[1], "", "")
def option(self, args):
for ValTuple in args:
if string.upper(ValTuple[0]) == "VALUE":
TheExtension = GetExtensionFromString(ValTuple[1])
if Html_ext.has_key(TheExtension):
self.anchor_bgn(ValTuple[1], "", "")
if __name__ == "__main__":
Parse = HTMLBot()
while (len(TargetURLList) > 0):
NextURL = TargetURLList[0]
del TargetURLList[0]
VisitedURL[NextURL] = 1
print "Retrieving: ", NextURL
URLTuple = urlparse.urlparse(NextURL, "http", 0)
TheExtension = GetExtension(URLTuple[2])
TargetPath = os.path.normpath(Dir + URLTuple[2])
if (TheExtension == ""):
TargetDir = TargetPath
TargetPath = os.path.normpath(TargetPath + "index.html")
else:
(TargetDir, TargetFile) = os.path.split(TargetPath)
try:
os.makedirs(TargetDir)
except:
pass
if Html_ext.has_key(TheExtension):
URLFile = urllib.urlopen(NextURL)
HTMLText = URLFile.read()
URLFile.close()
HTMLFile = open(TargetPath, "w")
HTMLFile.write(HTMLText)
HTMLFile.close()
Parse.newPage(NextURL)
Parse.feed(HTMLText)
Parse.close()
else:
urllib.urlretrieve(NextURL, TargetPath)
it's work to retrieve all files from the url but the purpose i've made this is for download manga jpg files from onemanga.com, for examples i want to downloads
chapter 1 of pluto manga from this url
http://www.onemanga.com/Pluto/1/ when i execute code it's make an error:
Retrieving: http://www.onemanga.com/Pluto/1/
Retrieving: http://www.onemanga.com/Pluto/1/01/
Retrieving: http://www.onemanga.com/Pluto/1/01/Malware / Virus
Traceback (most recent call last):
File "webbot.py", line 90, in <module>
HTMLFile = open(TargetPath, "w")
IOError: [Errno 2] No such file or directory: 'C:\\python254\\RobotFiles\\Pluto\\1\\01\\Malware \\ Virus\\index.html'
how to solve this issue ?, i've been add ImageURL to handle this thing and try to add JPG extension but not effects or i've been made mistake, please
correct me if i'm wrong.
And also is there any way to learn/get along with module python in easy way like just read the help of the module, so i can try to work/create code using the module for test about the list functions of the module cause as i know every module have differents help documentation which sometimes confused and sometimes easy(this is just personal opinion

), need advices please and thank's a lot.

Regards,
nubie