I am trying to scrape car prices from this website: https://www.lacentrale.fr.
I wrote a simple CrawlSpider that manages to crawl the website and get the information I need. However, when I check the scraped information, it doesn't match the information in the webpage.
For example, for this webpage: https://www.lacentrale.fr/auto-occasion-annonce-87101931938.html.
The price shown by the page is 39880€, but the price scraped by my crawler is 43070€. This is especially strange because when I check my code using the scrapy shell, I get the correct information (39880€).
I suspect there is some kind of dynamic change of the price that affects my results, as it seems that when I delete my cookies in my Safari browser, the prices sometimes change. One last detail: in the scrapy settings, I disabled cookies; otherwise the website would block me.
Could somebody explain to me what is happening, and tell me how to correct this?
I included the code I used, and the information that gets "corrupted" seems to be mostly prices, mileage, year and firstCirculationDate.
Thank you in advance for your help!
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class BilbasenSpider(CrawlSpider):
    """Crawl lacentrale.fr and yield one item per car-announcement page.

    Links whose URL matches ``auto-occasion-annonce`` are fetched and handed
    to :meth:`parse_announce`; every other in-domain link is only followed so
    that more announcement pages can be discovered.

    Each yielded item is a dict.  ``title`` is the text of the page's
    ``<title>`` element (or ``None``), ``web_adress`` is the URL of the
    request, and every other key holds the *list* of all regex matches found
    in the ``<script>`` children of ``<div class="cbm-mainColumn">``.
    """

    name = "lacentrale_scraper"
    allowed_domains = ['lacentrale.fr']
    start_urls = ['https://www.lacentrale.fr/auto-occasion-annonce-69107784183.html']

    rules = [
        # Announcement pages: scrape them and keep following their links.
        # The callback is deliberately NOT named ``parse``: the Scrapy
        # documentation warns that CrawlSpider uses ``parse`` for its own
        # rule handling, so it must not be overridden or used as a callback.
        Rule(
            LinkExtractor(allow='auto-occasion-annonce',
                          deny='pros.lacentrale.fr'),
            callback='parse_announce',
            follow=True,
            cb_kwargs={'is_announce': True},
        ),
        # Everything else: follow only.  No callback is attached, so no
        # cb_kwargs are needed here.
        Rule(LinkExtractor(allow='.*'), callback=None, follow=True),
    ]

    # XPath selecting the <script> elements that the field regexes search.
    _SCRIPT_XPATH = '//div[@class = "cbm-mainColumn"]/script'

    # (item key, regex) pairs, applied in this order to the selected scripts.
    # Raw single-quoted strings keep the embedded double quotes literal.
    _FIELD_PATTERNS = (
        ("make", r'"make":"(.*?)"'),
        ("model", r'"model":"(.*?)"'),
        ("version", r'"version":"(.*?)"'),
        ("year", r'"year":"(.*?)"'),
        # The leading comma is kept from the original pattern -- presumably it
        # avoids matching the nested {"mileage": ...} of "constructorWarranty".
        ("mileage", r',"mileage":([0-9]*),'),
        ("firstHand", r'"firstHand":(.*?),'),
        ("firstCirculationDate", r'"firstCirculationDate":"(.*?)"'),
        ("averageMileage", r'"averageMileage":([0-9]*),'),
        ("category", r'"category":"(.*?)"'),
        ("co2", r'"co2":([0-9]*),'),
        ("origin", r'"origin":"(.*?)",'),
        ("conversionBonusEligibility", r'"conversionBonusEligibility":(.*?),'),
        ("energy", r'"energy":"(.*?)"'),
        ("critAir", r'"critAir":([0-9]*)'),
        ("owners", r'"owners":([0-9]*)'),
        ("seats", r'"seats":([0-9]*)'),
        ("pollutionNorm", r'"pollutionNorm":"(.*?)"'),
        ("externalColor", r'"externalColor":"(.*?)"'),
        ("doors", r'"doors":([0-9]*)'),
        ("ratedHorsePower", r'"ratedHorsePower":([0-9]*)'),
        ("powerDIN", r'"powerDIN":([0-9]*)'),
        ("cubic", r'"cubic":([0-9]*)'),
        ("computedCo2", r'"computedCo2".*"combined":([0-9]*)'),
        ("ecoBonusMalus", r'"computedCo2".*"ecoBonusMalus":([0-9]*)'),
        ("consumptionCity", r'"computedConsumption".*"consumptionCity":(.*?),'),
        ("computed90", r'"computedConsumption".*"consumption90":(.*?),'),
        ("consumption120", r'"computedConsumption".*"consumption120":(.*?)}'),
        ("price", r'"price":([0-9]*)'),
        ("zipCode", r'"zipCode":"(.*?)"'),
        ("lastUpdate", r'"lastUpdate":"(.*?)"'),
        ("mileageBadge", r'"mileageBadge":"(.*?)"'),
        ("customerType", r'"customerType":"(.*?)"'),
        # The original capture group was "(.*? )": the stray space forced the
        # match to run past the closing quote unless the value ended in a
        # space.  Removed so the group stops at the first closing quote.
        ("reference", r'"reference":"(.*?)"'),
        ("referencePro", r'"referencePro":"(.*?)"'),
        ("constructorWarranty", r'"constructorWarranty":{"mileage":([0-9]*)}'),
        ("networkWarranty", r'"networkWarranty":{"duration":([0-9]*),'),
        ("equipmentsOrOptionsCount", r'"equipmentsOrOptionsCount":([0-9]*)'),
        ("classifiedId", r'"classifiedId":"(.*?)"'),
        ("vehicleFirstCirculationDate", r'"vehicleFirstCirculationDate":"(.*?)"'),
        ("vehicleGearbox", r'"vehicleGearbox":"(.*?)"'),
    )

    def parse_announce(self, response, is_announce=True):
        """Yield one item describing the announcement in ``response``.

        Args:
            response: The Scrapy response for an announcement page.
            is_announce: Supplied through the rule's ``cb_kwargs``; when
                false, nothing is yielded.

        Yields:
            A dict with ``title``, ``web_adress`` and one key per entry of
            ``_FIELD_PATTERNS``.  Every regex-derived value is the list
            returned by ``.re()``, i.e. all matches, possibly empty.
        """
        if not is_announce:
            return

        # Select the script nodes once instead of re-running the same XPath
        # for every field.
        scripts = response.xpath(self._SCRIPT_XPATH)

        item = {
            'title': response.css('title::text').extract_first(),
            'web_adress': response.request.url,
        }
        for field, pattern in self._FIELD_PATTERNS:
            item[field] = scripts.re(pattern)
        yield item