Friday, 2 August 2019

How to capture multiple responses on one single POST request using Scrapy?

I am trying to web scrape this website and download the pdf files available when you complete the whole lifecycle of this website. I am using Scrapy for this. I am having some trouble with capturing the captcha at the right time.

This site is an ASPX web page and uses 'Viewstate' values to keep track of each POST request. If you go through the site, you'll see that whenever you fill in any dropdown field, it sends a POST request with the 'Viewstate' value to a certain URL path, which you can see in the browser console. At the same time, it sends another GET request to a different URL to fetch the "CAPTCHA" image, but I am not able to get that response. I don't know whether Scrapy can capture multiple requests and multiple responses at the same time.

enter image description here

enter image description here

Now, I tried to find a workaround for this issue. I have followed almost everything mentioned in this StackOverflow post, but in response I get HTML containing a JavaScript alert saying "Wrong text inserted, Please enter new characters shown in image textbox". So that solution is not working for me either.

This is my scrapy spider code:

# -*- coding: utf-8 -*-
import scrapy
import cv2
import pytesseract
from PIL import Image
from io import BytesIO
from election_data.items  import ElectionDataItem

class ElectionSpider(scrapy.Spider):
    """Walk the roll-search form step by step and submit its CAPTCHA.

    The callbacks form a chain, each one posting the form back to
    ``start_urls[0]`` with the hidden ``__VIEWSTATE`` / ``__EVENTVALIDATION``
    values taken from the page it just received:

    ``parse`` (pick district) -> ``parse_assembly`` (pick assembly) ->
    ``parse_part`` (pick part) -> ``parse_captcha`` (fetch CAPTCHA image) ->
    ``store_image`` (OCR the image, press "Open PDF") -> ``get_pdfs``.

    Only the second ``<option>`` of every dropdown is selected (the first is
    presumably a placeholder entry -- TODO confirm against the live page).
    """

    name = 'election'
    allowed_domains = ['ceo.maharashtra.gov.in']
    start_urls = ['https://ceo.maharashtra.gov.in/searchlist/SearchRollPDF.aspx']
    # Not read or written anywhere in this class; kept for external users.
    dist_dict = []
    # Endpoint requested to obtain the CAPTCHA image.
    captcha_url = 'https://ceo.maharashtra.gov.in/searchlist/Captcha.aspx'

    @staticmethod
    def _second_option(response, select_id):
        """Return the value of the second ``<option>`` of a ``<select>``.

        Returns ``None`` when the select has fewer than two options (for
        example on an error page, which reaches the callbacks because
        ``handle_httpstatus_all`` is set on the requests).
        """
        values = response.css(
            'select#%s > option::attr(value)' % select_id
        ).extract()
        return values[1] if len(values) > 1 else None

    def _select_postback(self, response, select_id, selections, callback, carry):
        """Build the POST that a change of dropdown ``select_id`` triggers.

        :param response: page whose hidden state fields are echoed back.
        :param select_id: HTML ``id`` of the dropdown acting as event target.
        :param selections: ordered ``(field name, value)`` pairs for every
            dropdown chosen so far.
        :param callback: spider method that handles the resulting page.
        :param carry: values to expose to ``callback`` through ``meta``.
        :returns: the ``scrapy.FormRequest`` to yield.
        """
        data = {
            '__EVENTTARGET': response.css(
                'select#%s::attr(name)' % select_id
            ).extract_first(),
            '__EVENTARGUMENT': '',
            '__LASTFOCUS': '',
            '__VIEWSTATE': response.css(
                'input#__VIEWSTATE::attr(value)'
            ).extract_first(),
            '__EVENTVALIDATION': response.css(
                'input#__EVENTVALIDATION::attr(value)'
            ).extract_first(),
        }
        for field, value in selections:
            data[field] = value
        data['ctl00$Content$txtcaptcha'] = ''

        # Non-2xx pages are delivered to the callback as well.
        meta = {'handle_httpstatus_all': True}
        meta.update(carry)
        return scrapy.FormRequest(
            url=self.start_urls[0],
            method='POST',
            formdata=data,
            meta=meta,
            callback=callback,
        )

    def parse(self, response):
        """Select a district on the start page and post the form back."""
        district = self._second_option(response, 'Content_DistrictList')
        if district is None:
            self.logger.error('No district option found on %s', response.url)
            return
        yield self._select_postback(
            response,
            'Content_DistrictList',
            [('ctl00$Content$DistrictList', district)],
            self.parse_assembly,
            {'district': district},
        )

    def parse_assembly(self, response):
        """Select an assembly for the district carried in ``meta``."""
        self.logger.debug('parse_assembly')
        assembly = self._second_option(response, 'Content_AssemblyList')
        if assembly is None:
            self.logger.error('No assembly option found on %s', response.url)
            return
        district = response.meta['district']
        yield self._select_postback(
            response,
            'Content_AssemblyList',
            [
                ('ctl00$Content$DistrictList', district),
                ('ctl00$Content$AssemblyList', assembly),
            ],
            self.parse_part,
            {'district': district, 'assembly': assembly},
        )

    def parse_part(self, response):
        """Select a part for the district/assembly carried in ``meta``."""
        self.logger.debug('parse_part')
        part = self._second_option(response, 'Content_PartList')
        if part is None:
            self.logger.error('No part option found on %s', response.url)
            return
        district = response.meta['district']
        assembly = response.meta['assembly']
        yield self._select_postback(
            response,
            'Content_PartList',
            [
                ('ctl00$Content$DistrictList', district),
                ('ctl00$Content$AssemblyList', assembly),
                ('ctl00$Content$PartList', part),
            ],
            self.parse_captcha,
            {'district': district, 'assembly': assembly, 'part': part},
        )

    def parse_captcha(self, response):
        """Request the CAPTCHA image, keeping the form page for the final POST."""
        meta = {
            key: response.meta[key]
            for key in ('district', 'assembly', 'part')
        }
        # The page itself travels along so store_image can submit its form.
        meta['data_for_later'] = response
        yield scrapy.Request(
            url=self.captcha_url,
            callback=self.store_image,
            meta=meta,
            # The URL is identical on every form cycle; without this the
            # duplicate filter would drop any fetch after the first one.
            dont_filter=True,
        )

    def store_image(self, response):
        """Save and OCR the CAPTCHA image, then submit the "Open PDF" form."""
        captcha_target_filename = 'filename.png'
        # Written to disk because solve_captcha() takes a file path.
        Image.open(BytesIO(response.body)).save(captcha_target_filename)
        captcha_text = self.solve_captcha(captcha_target_filename)
        self.logger.debug('captcha text: %r', captcha_text)

        # __VIEWSTATE and __EVENTVALIDATION are intentionally absent:
        # FormRequest.from_response() pre-fills every field that is not
        # listed here from the form page, so the hidden state sent to the
        # server is that of the most recent page, not an earlier one.
        data = {
            '__EVENTTARGET': '',
            '__EVENTARGUMENT': '',
            '__LASTFOCUS': '',
            'ctl00$Content$DistrictList': response.meta['district'],
            'ctl00$Content$AssemblyList': response.meta['assembly'],
            'ctl00$Content$PartList': response.meta['part'],
            'ctl00$Content$txtcaptcha': captcha_text,
            'ctl00$Content$OpenButton': 'Open PDF',
        }
        yield scrapy.FormRequest.from_response(
            response.meta['data_for_later'],
            method='POST',
            formdata=data,
            meta={'handle_httpstatus_all': True},
            callback=self.get_pdfs,
        )

    def get_pdfs(self, response):
        """Report the server's answer to the "Open PDF" submission."""
        # THIS IS WHERE FINAL RESPONSE IS CAPTURED
        if isinstance(response, scrapy.http.TextResponse):
            self.logger.info('%s', response.text)
        else:
            # Non-text bodies (e.g. the PDF itself) have no ``.text``;
            # reading it would raise AttributeError.
            self.logger.info(
                'Received %d bytes of non-text content from %s',
                len(response.body),
                response.url,
            )

    def solve_captcha(self, image):
        """OCR the CAPTCHA stored at path ``image`` and return its text.

        :param image: path of the image file to read.
        :returns: the recognised text with surrounding whitespace removed.
        """
        # Flag 0 loads the file as a single-channel grayscale image.
        gray = cv2.imread(image, 0)
        thresh = cv2.threshold(gray, 220, 255, cv2.THRESH_BINARY)[1]

        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
        close = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)

        # Invert the cleaned-up binary image before handing it to Tesseract.
        result = 255 - close

        # Tesseract output usually ends with a newline/form feed, which must
        # not be submitted as part of the answer.
        return pytesseract.image_to_string(result).strip()

If you go through the above-mentioned site, fill in all the form details, and monitor the Network tab of the browser console, you'll get an idea of this problem.

Kindly guide me in how to solve this issue. Thank you.



from How to capture multiple responses on one single POST request using Scrapy?

No comments:

Post a Comment