# Source code for parsing.library.requester

# Copyright (C) 2017 Semester.ly Technologies, LLC
#
# Semester.ly is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Semester.ly is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

from __future__ import absolute_import, division, print_function

import requests
import cookielib
import sys
import interruptingcow

from fake_useragent import UserAgent
from bs4 import BeautifulSoup


class Requester(object):
    """Wrapper around a ``requests`` session with retries and auto-parsing.

    Attributes:
        session: ``requests.Session`` used for every request by default.
        headers (dict): default request headers; created with a random
            ``User-Agent``.
        cookies: ``cookielib.CookieJar`` sent with every request by default.
    """

    # Number of attempts ``http_request`` makes before giving up.
    MAX_ATTEMPTS = 10

    def __init__(self):
        self.session = requests.Session()
        self.headers = {'User-Agent': UserAgent().random}
        self.cookies = cookielib.CookieJar()  # TODO - maybe this is not needed

    def new_user_agent(self):
        """Replace the default ``User-Agent`` header with a new random one."""
        self.headers['User-Agent'] = UserAgent().random

    def overwrite_header(self, new_headers):
        """Replace the default headers wholesale.

        Args:
            new_headers (dict): headers used for subsequent requests.
        """
        self.headers = new_headers

    def http_request(self, do_http_request, type, parse=True, quiet=True,
                     timeout=60, throttle=(lambda: None)):
        """Perform HTTP request, retrying on timeouts and connection errors.

        The request is attempted up to ``MAX_ATTEMPTS`` times. From the third
        attempt onwards ``throttle`` is invoked after each failure. After a
        timeout or connection error the default User-Agent is re-randomised.

        Args:
            do_http_request: function that returns request object
            type (str): GET, POST, HEAD (only used for logging)
            parse (bool, optional): Specifies if return should be parsed.
                Autodetects parse type as html, xml, or json.
            quiet (bool, optional): suppress output if True (default True)
            timeout (int, optional): limit passed to
                ``interruptingcow.timeout`` for each attempt (default 60).
            throttle (callable, optional): zero-argument callable invoked
                between attempts once the first two have failed.

        Returns:
            request object: if parse is False, or if the parsed value is
                falsy and not an empty list
            soup: soupified/jsonified text of http request
            None: if no attempt produced a response
        """
        response = None
        for attempt in range(self.MAX_ATTEMPTS):
            try:
                with interruptingcow.timeout(
                        timeout, exception=requests.exceptions.Timeout):
                    response = do_http_request()
            except (requests.exceptions.Timeout,
                    requests.exceptions.ConnectionError) as error:
                if attempt > 1:
                    Requester._back_off(throttle)
                # ``type`` is shadowed by the parameter, hence ``__class__``.
                print("Requester error:",
                      str(error.__class__),
                      file=sys.stderr)
                self.new_user_agent()
                continue

            if response is not None:
                break

            if attempt > 1:
                Requester._back_off(throttle)

        return Requester._finish(response, type, parse, quiet)

    @staticmethod
    def _back_off(throttle):
        """Announce throttling on stderr, then run the ``throttle`` callback."""
        # TODO - should not be stderr, maybe warning?
        print('THROTTLING REQUESTER', file=sys.stderr)
        throttle()

    @staticmethod
    def _finish(response, method, parse, quiet):
        """Log and optionally parse the outcome of ``http_request``.

        Args:
            response: response object, or None if every attempt failed.
            method (str): HTTP verb, printed when ``quiet`` is False.
            parse (bool): parse the response with ``markup`` if True.
            quiet (bool): suppress output if True.

        Returns:
            None if ``response`` is None; the raw response if ``parse`` is
            False or the parsed value is falsy and not ``[]``; otherwise the
            parsed value.
        """
        if response is None:
            # Nothing to log or parse; avoids AttributeError on ``.url``.
            return None

        if not quiet:
            print(method, response.url)

        if not parse:
            return response

        parsed = Requester.markup(response)
        # An empty JSON list is a valid result; any other falsy parse
        # (e.g. ``{}``) falls back to the raw response.
        if parsed or parsed == []:
            return parsed
        return response

    def get(self, url,
            params='',
            session=None,
            cookies=None,
            headers=None,
            verify=True,
            **kwargs):
        """HTTP GET.

        Args:
            url (str): url to query
            params (dict): payload dictionary of HTTP params (default '')
            session (None, optional): session to send the request with;
                defaults to this requester's own session.
            cookies (None, optional): cookies to send; defaults to this
                requester's cookie jar.
            headers (None, optional): headers to send; defaults to this
                requester's headers.
            verify (bool, optional): forwarded to the session's ``get``.
            **kwargs: forwarded to ``http_request`` (parse, quiet, timeout,
                throttle).

        Returns:
            Whatever ``http_request`` returns.
        """
        active_session = self.session if session is None else session

        def request():
            # Defaults are resolved at call time so that a User-Agent
            # refreshed between retries is picked up.
            return active_session.get(
                url,
                params=params,
                cookies=cookies if cookies is not None else self.cookies,
                headers=headers if headers is not None else self.headers,
                verify=verify,
            )

        return self.http_request(request, 'GET', **kwargs)

    def post(self, url,
             data='',
             params='',
             cookies=None,
             headers=None,
             verify=True,
             **kwargs):
        """HTTP POST.

        Args:
            url (str): url to query
            data (str, optional): HTTP form key-value dictionary
            params (dict): payload dictionary of HTTP params
            cookies (None, optional): cookies to send; defaults to this
                requester's cookie jar.
            headers (None, optional): headers to send; defaults to this
                requester's headers.
            verify (bool, optional): forwarded to the session's ``post``.
            **kwargs: forwarded to ``http_request`` (parse, quiet, timeout,
                throttle).

        Returns:
            Whatever ``http_request`` returns.
        """
        def request():
            # Defaults are resolved at call time so that a User-Agent
            # refreshed between retries is picked up.
            return self.session.post(
                url,
                data=data,
                params=params,
                cookies=cookies if cookies is not None else self.cookies,
                headers=headers if headers is not None else self.headers,
                verify=verify,
            )

        return self.http_request(request, 'POST', **kwargs)

    @staticmethod
    def markup(response):
        """Autodetects html, json, or xml format in response.

        JSON is tried first. Otherwise the text is parsed with BeautifulSoup,
        using ``html.parser`` when it contains ``</html>`` and ``lxml`` when
        it does not.

        Args:
            response: raw response object

        Returns:
            markedup response, or None if ``response`` is None
        """
        if response is None:
            return None

        try:
            return response.json()
        except ValueError:
            pass

        text = response.text
        parser = 'html.parser' if "</html>" in text else 'lxml'
        return BeautifulSoup(text, parser)