Getting a response body with scrapy splash
我正在使用 scrapy 1.6 和 splash 3.2，代码如下：
import scrapy
import random
from scrapy_splash import SplashRequest
from scrapy.utils.response import open_in_browser
from scrapy.linkextractors import LinkExtractor

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:48.0) Gecko/20100101 Firefox/48.0'


class MySpider(scrapy.Spider):
    """Spider that requests each start URL through Splash's ``render.html`` endpoint."""

    name = 'mytest'
    start_urls = ["http://yahoo.com"]

    def start_requests(self):
        """Yield one ``SplashRequest`` per entry in ``start_urls``, handled by ``parse``."""
        for start_url in self.start_urls:
            request_headers = {
                'User-Agent': USER_AGENT,
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            }
            yield SplashRequest(
                start_url,
                self.parse,
                endpoint='render.html',
                args={'wait': 2.5},
                headers=request_headers,
            )

    def parse(self, response):
        """Pass the Splash response straight to ``open_in_browser``."""
        # response.body is a result of render.html call; it
        # contains HTML processed by a browser.
        # Earlier attempt, left disabled:
        # from scrapy.http.response.html import HtmlResponse
        # ht = HtmlResponse('jj')
        # ht.body.replace =response
        open_in_browser(response)
        return None
问题是：当我尝试在浏览器中打开响应时，它却在记事本中打开了。
查看 https://splash.readthedocs.io/en/stable/scripting-response-object.html。如何激活 response.body 以便我可以在浏览器中打开响应(然后我希望能够使用浏览器开发工具来获取 xpath)?
我解决了这个问题：
def parse(self, response):
    """Re-wrap the Splash response as an ``HtmlResponse`` and pass it to ``open_in_browser``."""
    from scrapy.http.response.html import HtmlResponse

    # response.body is the result of the render.html call; it
    # contains HTML processed by a browser.
    html_response = HtmlResponse(
        url=response.url,
        body=response.body,
        encoding="utf-8",
        request=response.request,
    )
    open_in_browser(html_response)
    return None
您也可以暂时以适合您用例的方式重新实现 `open_in_browser`。