This spider, which combines Selenium with Scrapy, works fine except for one problem:
I need to re-evaluate sites = response.xpath()
every time the page generates new source code; otherwise it keeps returning the same results again and again.
import time
from urlparse import urljoin

import scrapy
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.http import TextResponse
from scrapy.selector import Selector
from selenium import webdriver


class Product(scrapy.Item):
    """Item emitted for each review header found on the page."""

    # Result of ``.extract()`` on the author-name link text of one review.
    title = scrapy.Field()


class FooSpider(CrawlSpider):
    """Spider that pages through reviews with a Selenium-driven Chrome.

    Scrapy fetches the start URL, then ``parse`` opens the same URL in the
    browser, repeatedly clicks the paging button, and scrapes the review
    headers out of the browser's *current* DOM after every click.

    NOTE: ``CrawlSpider`` uses ``parse`` internally to apply its rules.
    Overriding it works here only because this spider defines no rules.
    """

    name = 'foo'
    start_urls = ["https://www.example.com"]

    # Absolute XPath of the button that loads the next batch of reviews.
    NEXT_BUTTON_XPATH = (
        "/html/body/div[4]/div[6]/div[1]/div[2]/div[2]/div[1]/div[2]"
        "/button[1]/div[2]/div/div"
    )
    REVIEW_HEADER_XPATH = (
        '//div[@class="single-review"]/div[@class="review-header"]'
    )
    AUTHOR_NAME_XPATH = (
        './/div[@class="review-info"]/span[@class="author-name"]/a/text()'
    )

    def __init__(self, *args, **kwargs):
        """Start the Chrome driver and configure its implicit wait."""
        super(FooSpider, self).__init__(*args, **kwargs)
        self.download_delay = 0.25
        # Raw string so the backslashes in the Windows path are never
        # interpreted as escape sequences.
        self.browser = webdriver.Chrome(
            executable_path=r"c:\chrm\chromedriver.exe")
        # Implicit wait is a session-wide setting: it is configured once and
        # applies to every later element lookup. It is not a sleep.
        self.browser.implicitly_wait(60)

    def closed(self, reason):
        """Quit the browser when Scrapy closes the spider.

        Without this the Chrome/chromedriver processes outlive the crawl.
        """
        self.browser.quit()

    def parse(self, response):
        """Click through the review pages and yield one item per review.

        :param response: Scrapy response for the start URL; only its ``url``
            is used, because the content of interest is rendered by
            JavaScript inside the browser.
        :returns: generator of ``Product`` items.
        """
        self.browser.get(response.url)

        for _ in range(0, 200):
            # Fixed pause carried over from the original code. An explicit
            # wait on the button would be more reliable than a blind sleep.
            time.sleep(20)

            button = self.browser.find_element_by_xpath(
                self.NEXT_BUTTON_XPATH)
            button.click()

            # Re-parse the browser's current DOM on every iteration.
            # Selecting from the static ``response`` once, outside the loop,
            # is what made every iteration yield the same reviews.
            page = Selector(text=self.browser.page_source)

            for site in page.xpath(self.REVIEW_HEADER_XPATH):
                item = Product()
                item['title'] = site.xpath(self.AUTHOR_NAME_XPATH).extract()
                yield item
You need to create a new Selector
instance inside the loop, after each click, passing it the current page source from the browser's .page_source
attribute:
from scrapy.selector import Selector


# Replacement for the spider's parse() method (it belongs inside the spider
# class; ``time`` and ``Product`` come from the spider module).
def parse(self, response):
    """Click through the review pages and yield one item per review.

    A new ``Selector`` is built from the browser's current page source after
    every click, so each iteration sees the freshly loaded reviews instead of
    the static response Scrapy downloaded.
    """
    self.browser.get(response.url)

    # Session-wide setting; configuring it once is enough.
    self.browser.implicitly_wait(30)

    for _ in range(0, 200):
        time.sleep(20)  # TODO: a fixed delay like this is fragile

        button = self.browser.find_element_by_xpath("/html/body/div[4]/div[6]/div[1]/div[2]/div[2]/div[1]/div[2]/button[1]/div[2]/div/div")
        button.click()

        # Parse what the browser is showing *now*, not the original response.
        sel = Selector(text=self.browser.page_source)
        sites = sel.xpath('//div[@class="single-review"]/div[@class="review-header"]')

        for site in sites:
            item = Product()
            item['title'] = site.xpath('.//div[@class="review-info"]/span[@class="author-name"]/a/text()').extract()
            yield item
Note that you only need to call implicitly_wait()
once. It doesn't add an immediate delay; it only instructs Selenium
to wait up to X seconds when searching for elements.
Also, I doubt you need the time.sleep(20)
call. Instead, you may want to start using explicit waits
(WebDriverWait with an expected condition).
Comments
Post a Comment