# Source code for parsing.library.extractor

# Copyright (C) 2017 Semester.ly Technologies, LLC
#
# Semester.ly is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Semester.ly is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

from __future__ import absolute_import, division, print_function

# NOTE: module currently unused as it introduces too many bugs.
#       Might reconsider for later use.

import re
# import unicodedata

from collections import namedtuple

from parsing.library.utils import make_list

# One extraction rule: the result `key`, the `container` callable applied to
# the captured text, and the regex `patterns` tried for that key.
Extraction = namedtuple('Extraction', ['key', 'container', 'patterns'])


def extract_info_from_text(text,
                           inject=None,
                           extractions=None,
                           use_lowercase=True,
                           splice_text=True):
    """Attempt to extract info from text and put it into course object.

    NOTE: Currently unstable and unused as it introduces too many bugs.
          Might reconsider for later use.

    Every pattern of every extraction is tried once with ``re.search``.
    On a match, the text captured by group 1 is passed through the
    extraction's ``container`` and accumulated (with ``+=``) under the
    extraction's ``key``. Extraction is best-effort: a pattern whose
    capture cannot be converted or accumulated is skipped.

    Args:
        text (str): text to attempt to extract information from
        inject (dict, optional): mapping that extracted values are
            accumulated into. When falsy (``None`` or empty), a fresh
            dict is used instead.
        extractions (iterable, optional): ``(key, container, patterns)``
            triples (e.g. ``Extraction`` tuples). ``container`` must be
            callable both with no arguments (initial value) and with the
            captured string. Defaults to rules for prereqs, coreqs,
            geneds and fee.
        use_lowercase (bool, optional): search a lowercased copy of the
            text; captured values are still sliced from the original.
        splice_text (bool, optional): remove each successfully extracted
            match from the text.

    Returns:
        tuple or str: ``(text, extracted)`` when ``inject`` is falsy,
        otherwise only the (possibly spliced) text.
    """
    if extractions is None:
        extractions = (
            Extraction(
                key='prereqs',
                container=make_list,
                patterns=(r'pr-?ereq(?:uisite)?s?[:,\s]\s*(.*?)(?:\.|$)\s*',
                          r'take (.*)\.?$')
            ),
            Extraction(
                key='coreqs',
                container=make_list,
                patterns=(r'co-?req(?:uisite)?s?[:,\s]\s*(.*?)(?:\.|$)\s*',)
            ),
            Extraction(
                key='geneds',
                container=make_list,
                patterns=(r'ge (.*)',)
            ),
            Extraction(
                key='fee',
                container=float,
                patterns=(
                    r'(?:lab )?fees?:?\s{1,2}?\$?\s?(\d+(?:\.\d{1,2})?)',)
            )
        )

    # Search for matches.
    extracted = inject or {}
    for key, container, patterns in extractions:
        for pattern in patterns:
            # Recomputed per pattern because `text` may have been spliced.
            # NOTE(review): offsets found in the lowercased copy are applied
            # to the original text; this assumes lower() preserves length.
            match = re.search(pattern, text.lower() if use_lowercase else text)
            if not match:
                continue
            # Nothing to extract if the pattern has no capture group or
            # group 1 did not take part in the match.
            if not match.re.groups or match.group(1) is None:
                continue
            # Use the real position of group 1 rather than searching for its
            # text inside the match, which can hit an earlier occurrence.
            start, end = match.span(1)
            try:
                contained = container(text[start:end])
                current = extracted.setdefault(key, container())
                current += contained
                # Store the result back: for immutable containers (e.g.
                # float) `+=` rebinds the local instead of updating in place.
                extracted[key] = current
            except Exception:
                # Best-effort: skip captures that cannot be converted or
                # accumulated, and leave the text untouched.
                continue
            if splice_text:
                text = text[:match.start()] + text[match.end():]

    if not inject:
        return text, extracted
    return text