python - How to update the new page source every time in a Scrapy XPath selector while using Selenium?


This Selenium-integrated Scrapy spider works fine, with one problem:

I need to update `sites = response.xpath(...)` every time a new page source is generated; otherwise it keeps returning the same repetitive results again and again.

import scrapy
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import Selector
from scrapy.http import TextResponse
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from urlparse import urljoin
from selenium import webdriver
import time


class Product(scrapy.Item):
    # Scraped item: holds the review author's name.
    title = scrapy.Field()


class FooSpider(CrawlSpider):
    """Spider that drives a Selenium browser to click through paginated
    reviews and yields one Product per review author."""

    name = 'foo'

    start_urls = ["https://www.example.com"]

    def __init__(self, *args, **kwargs):
        super(FooSpider, self).__init__(*args, **kwargs)
        self.download_delay = 0.25
        self.browser = webdriver.Chrome(executable_path="c:\chrm\chromedriver.exe")
        self.browser.implicitly_wait(60)

    def parse(self, response):
        self.browser.get(response.url)
        # BUG (the subject of this question): `sites` is extracted once from
        # the initial, static Scrapy response and is never refreshed after a
        # click, so every loop iteration re-yields the same reviews.
        sites = response.xpath('//div[@class="single-review"]/div[@class="review-header"]')

        for i in range(0, 200):
            items = []
            time.sleep(20)
            button = self.browser.find_element_by_xpath("/html/body/div[4]/div[6]/div[1]/div[2]/div[2]/div[1]/div[2]/button[1]/div[2]/div/div")
            button.click()
            self.browser.implicitly_wait(30)

            for site in sites:
                item = Product()
                item['title'] = site.xpath('.//div[@class="review-info"]/span[@class="author-name"]/a/text()').extract()
                yield item

You need to create a new `Selector` instance inside the loop, after each click, passing it the browser's current page source via `.page_source`:

from scrapy.selector import selector  self.browser.implicitly_wait(30)  in range(0,200):     time.sleep(20)  # todo: delay doesn't      button = self.browser.find_element_by_xpath("/html/body/div[4]/div[6]/div[1]/div[2]/div[2]/div[1]/div[2]/button[1]/div[2]/div/div")     button.click()      sel = selector(text=self.browser.page_source)     sites = sel.xpath('//div[@class="single-review"]/div[@class="review-header"]')      site in sites:         item = product()          item['title'] = site.xpath('.//div[@class="review-info"]/span[@class="author-name"]/a/text()').extract()         yield item 

Note that you only need to call `implicitly_wait()` once — it does not add an immediate delay; it instructs Selenium to wait up to X seconds whenever it searches for elements.

Also, I doubt you need the `time.sleep(20)` call at all. Instead, you may want to start using explicit waits.


Comments