PyQt5 to scrape IMDb webpage
我现在已经开始使用python进行web抓取了,我想从这个链接中抓取图像。这是"检查"的截图。这是我尝试的代码,因为它涉及到JavaScript。
"""Render an IMDb media-viewer page with Qt WebEngine and parse the result.

The page is loaded in an off-screen ``QWebEnginePage`` so that its JavaScript
runs, then the resulting HTML is handed to BeautifulSoup.
"""
import sys
import urllib.request

import bs4 as bs
from PyQt5.QtWebEngineWidgets import QWebEnginePage
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl


class Page(QWebEnginePage):
    """Load ``url`` and block until its HTML has been captured in ``self.html``.

    Constructing the object starts a Qt event loop; the constructor returns
    only after ``Callable`` has received the HTML and quit the application.
    """

    def __init__(self, url):
        """Create the application, start loading ``url`` and run the event loop.

        Args:
            url: Address of the page to load, as a string.
        """
        # The application object is created before the page is initialised
        # (same order as the original code).
        self.app = QApplication(sys.argv)
        super().__init__()
        self.html = ''
        self.loadFinished.connect(self._on_load_finished)
        self.load(QUrl(url))
        # Blocks here until Callable() calls self.app.quit().
        self.app.exec_()

    def _on_load_finished(self):
        """Request the page HTML once the load has finished.

        ``toHtml`` is asynchronous: it returns ``None`` immediately and later
        delivers the markup to the callback. Its return value is therefore
        not stored; ``Callable`` is the only place that sets ``self.html``.

        NOTE(review): content that page scripts inject after the load event
        may still be missing from the captured HTML.
        """
        self.toHtml(self.Callable)
        print('Load finished')

    def Callable(self, html_str):
        """Store the HTML delivered by ``toHtml`` and stop the event loop.

        Args:
            html_str: The page markup as a string.
        """
        self.html = html_str
        self.app.quit()


def main():
    """Load the media-viewer page and print its ``photo-container`` div."""
    page = Page('https://www.imdb.com/name/nm0005683/mediaviewer/rm2073384192')
    soup = bs.BeautifulSoup(page.html, 'html.parser')
    imagetag = soup.find('div', id='photo-container')
    print(imagetag)


if __name__ == '__main__':
    main()
这个代码实际上是从这里来的,我只是修改了链接
我得到的错误
js: Uncaught TypeError: Cannot read property 'x' of undefined
Load finished
我不知道实际的错误是什么,`<div id="photo-container">` 的内容没有显示出来。我确实用谷歌搜索过这个错误,但没有找到对这种情况有帮助的内容。另外,如果有比这更合适的抓取图像的方法,我也愿意接受建议。
PS:我也不熟悉stackoverflow,所以如果这里有什么不符合规则的地方,我可以根据需要编辑这个问题。
您可能需要使用WebChannel来完成实际的工作,但下面的代码会向您展示如何访问您要查找的图像。对WebChannel的进一步研究就留给您自己了。
"""Show an IMDb media-viewer page in a QWebEngineView and log its image URLs.

After the page reports that loading finished, a JavaScript snippet is run
inside it; everything the snippet writes to the browser console is echoed
to stdout.
"""
import sys

from PyQt5.QtWebEngineWidgets import QWebEngineView, QWebEnginePage
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl, QTimer

# Delay between the loadFinished signal and running the probe script.
_PROBE_DELAY_MS = 1000
# Delay between the loadFinished signal and quitting the application.
_QUIT_DELAY_MS = 5000
_MEDIA_VIEWER_URL = 'https://www.imdb.com/name/nm0005683/mediaviewer/rm2073384192'


class Page(QWebEnginePage):
    """Web page that inspects ``#photo-container`` shortly after loading."""

    def __init__(self, parent):
        """Initialise the page under ``parent`` and hook the load signal."""
        super().__init__(parent)
        self.html = ''
        self.loadFinished.connect(self._on_load_finished)

    def _on_load_finished(self):
        """Schedule the JavaScript probe and the application shutdown."""
        print('Load finished')
        # A finished load does not mean the page is rendered, so the probe
        # runs after a pause; the delay may need tuning.
        QTimer.singleShot(_PROBE_DELAY_MS, self._after_loading)
        QTimer.singleShot(_QUIT_DELAY_MS, self._exit)

    def _after_loading(self):
        """Run a script in the page that logs every image under the container."""
        print('_after_loading')
        script = '''console.log('javascript...');
        var images = document.querySelectorAll('#photo-container img');
        console.log('images ' + images);
        console.log('images ' + images.length);
        for (var i = 0; i < images.length; i++)
        {
            var image = images[i];
            console.log(image.src);
        }
        var element = document.querySelector('body');
        //console.log(element.innerHTML); // If you uncomment this you'll see the the photo-container is still empty
        '''
        self.runJavaScript(script)
        print('_after_loading...done')

    def _exit(self):
        """Quit the running QApplication."""
        print('_exit')
        QApplication.instance().quit()

    def javaScriptConsoleMessage(self, level: QWebEnginePage.JavaScriptConsoleMessageLevel, message: str, lineNumber: int, sourceID: str):
        """Echo each browser-console message to stdout."""
        print(message)


def main():
    """Create the view, load the media-viewer URL and run the event loop."""
    app = QApplication(sys.argv)
    view = QWebEngineView()
    page = Page(view)
    view.setPage(page)
    view.load(QUrl(_MEDIA_VIEWER_URL))
    view.show()
    app.exec_()


if __name__ == '__main__':
    main()