Python BeautifulSoup web image crawler IOError: [Errno 2] No such file or directory
I wrote the Python code below to crawl the images on www.style.com:
import urllib2, urllib, random, threading
from bs4 import BeautifulSoup
import sys

reload(sys)
sys.setdefaultencoding('utf-8')

class Images(threading.Thread):
    def __init__(self, lock, src):
        threading.Thread.__init__(self)
        self.src = src
        self.lock = lock

    def run(self):
        self.lock.acquire()
        urllib.urlretrieve(self.src, './img/' + str(random.choice(range(9999))))
        print self.src + 'get'
        self.lock.release()

def imgGreb():
    lock = threading.Lock()
    site_url = "http://www.style.com"
    html = urllib2.urlopen(site_url).read()
    soup = BeautifulSoup(html)
    img = soup.findAll(['img'])
    for i in img:
        print i.get('src')
        Images(lock, i.get('src')).start()

if __name__ == '__main__':
    imgGreb()
But I got this error:
IOError: [Errno 2] No such file or directory: '/images/homepage-2013-october/header/logo.png'
How can I fix this?
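My guess is that the src values on this page are site-relative paths (like the /images/... one in the traceback), and urllib.urlretrieve treats a string without a scheme as a local file path, hence the IOError. A minimal, untested sketch of what I think the fix looks like, resolving the path against the page URL with urlparse.urljoin (the example src is just the one from the traceback):

from urlparse import urljoin

site_url = "http://www.style.com"
src = "/images/homepage-2013-october/header/logo.png"  # example src from the traceback
full_url = urljoin(site_url, src)                       # -> "http://www.style.com/images/..."
# urllib.urlretrieve(full_url, './img/logo.png')        # should no longer raise IOError

Is that the right direction?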
Also, is it possible to recursively find all the images on the site? I mean the images that are not on the homepage.

Thanks!
Try the code below directly:
import urllib2
from urllib import urlretrieve
from bs4 import BeautifulSoup

def imgGreb():
    site_url = "http://www.style.com"
    html = urllib2.urlopen(site_url).read()
    soup = BeautifulSoup(html)
    img = soup.findAll(['img'])
    for i in img:
        try:
            # build the complete URL from the domain and the relative URL you scraped
            url = site_url + i.get('src')
            # get the file name
            name = "result_" + url.split('/')[-1]
            # check whether it is an image type you want
            ext = name.split('.')[-1]
            if ext in ['jpg', 'png', 'gif']:
                # if so, download the picture
                urlretrieve(url, name)
        except:
            pass

if __name__ == '__main__':
    imgGreb()
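As for the second question, about images that are not on the homepage: one way is a small same-domain, depth-limited crawl that follows the <a> links on each page and applies the same image handling there. The sketch below is only an outline under those assumptions; crawl_images, MAX_DEPTH, and the result_ file naming are illustrative, and it uses urlparse.urljoin so relative src and href values resolve correctly:

import urllib2
from urllib import urlretrieve
from urlparse import urljoin, urlparse
from bs4 import BeautifulSoup

MAX_DEPTH = 2        # illustrative limit so the crawl stays small
seen_pages = set()   # pages already visited, to avoid loops

def crawl_images(page_url, depth=0):
    # stop at the depth limit or if this page was already crawled
    if depth > MAX_DEPTH or page_url in seen_pages:
        return
    seen_pages.add(page_url)
    try:
        html = urllib2.urlopen(page_url).read()
    except Exception:
        return
    soup = BeautifulSoup(html)

    # download every image on this page, resolving relative src values first
    for img in soup.findAll('img'):
        src = img.get('src')
        if not src:
            continue
        url = urljoin(page_url, src)
        name = "result_" + url.split('/')[-1]
        if name.split('.')[-1].lower() in ['jpg', 'png', 'gif']:
            try:
                urlretrieve(url, name)
            except Exception:
                pass

    # follow links on the same domain to reach images beyond the homepage
    for a in soup.findAll('a'):
        href = a.get('href')
        if not href:
            continue
        link = urljoin(page_url, href)
        if urlparse(link).netloc == urlparse(page_url).netloc:
            crawl_images(link, depth + 1)

if __name__ == '__main__':
    crawl_images("http://www.style.com")

Without the depth limit and the seen_pages set the recursion can easily blow up on a large site, so keep both if you try something like this.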