# Source code for parsing.library.extractor

# Copyright (C) 2017 Semester.ly Technologies, LLC
#
# Semester.ly is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Semester.ly is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

from __future__ import absolute_import, division, print_function

# NOTE: module currently unused as it introduces too many bugs.
#       Might reconsider for later use.

import re
# import unicodedata

from collections import namedtuple

from parsing.library.utils import make_list

# One rule describing a piece of information to pull out of free text:
#   key       -- name under which the extracted value is stored
#   container -- callable applied to the captured text
#   patterns  -- regular expressions to try; capture group 1 holds the value
Extraction = namedtuple(
    typename='Extraction',
    field_names='key container patterns',
)


def extract_info_from_text(text,
                           inject=None,
                           extractions=None,
                           use_lowercase=True,
                           splice_text=True):
    """Attempt to extract info from text and put it into course object.

    NOTE: Currently unstable and unused as it introduces too many bugs.
          Might reconsider for later use.

    Every extraction is a ``(key, container, patterns)`` triple. Each pattern
    is searched for once; on a match, the text captured by group 1 is passed
    through ``container`` and accumulated (``+=``) under ``key``.

    Args:
        text (str): text to attempt to extract information from.
        inject (dict, optional): mapping to accumulate extracted values into;
            it is mutated in place. When omitted, a fresh dict is created
            and returned alongside the text.
        extractions (iterable, optional): ``(key, container, patterns)``
            triples to apply. ``container`` must be callable both with no
            arguments (to produce the empty starting value) and with the
            captured string. Defaults to rules for prerequisites,
            corequisites, general-education tags and fees.
        use_lowercase (bool, optional): match the patterns against a
            lowercased copy of ``text``. Captured values are still sliced
            from the original text, so their case is preserved.
        splice_text (bool, optional): remove every matched span from the
            returned text.

    Returns:
        tuple or str: ``(text, extracted)`` when ``inject`` is None,
        otherwise only the (possibly trimmed) text.
    """
    if extractions is None:
        extractions = (
            Extraction(
                key='prereqs',
                container=make_list,
                patterns=(r'pre-?req(?:uisite)?s?[:,\s]\s*(.*?)(?:\.|$)\s*',
                          r'take (.*)\.?$')
            ),
            Extraction(
                key='coreqs',
                container=make_list,
                patterns=(r'co-?req(?:uisite)?s?[:,\s]\s*(.*?)(?:\.|$)\s*',)
            ),
            Extraction(
                key='geneds',
                container=make_list,
                patterns=(r'ge (.*)',)
            ),
            Extraction(
                key='fee',
                container=float,
                patterns=(
                    r'(?:lab )?fees?:?\s{1,2}?\$?\s?(\d+(?:\.\d{1,2})?)',)
            )
        )

    # Test against None (not truthiness) so that an empty mapping handed in
    # by the caller is still the one that gets populated.
    extracted = {} if inject is None else inject

    for key, container, patterns in extractions:
        for pattern in patterns:
            # Recomputed for every pattern because ``text`` shrinks whenever
            # a previous match was spliced out.
            haystack = text.lower() if use_lowercase else text
            match = re.search(pattern, haystack)
            if not match:
                continue
            try:
                # Offsets come from the (possibly lowercased) haystack but
                # are applied to the original text to keep its case. This
                # assumes lower() preserves length, which holds for ASCII.
                start, end = match.span(1)
                if start < 0:
                    # Group 1 exists but did not take part in the match.
                    continue
                contained = container(text[start:end])
                current = extracted.setdefault(key, container())
                current += contained
                # Write back: for immutable containers such as float the
                # ``+=`` above only rebinds the local name.
                extracted[key] = current
                if splice_text:
                    text = text[:match.start()] + text[match.end():]
            except Exception:
                # Best effort: a pattern without a capture group or a value
                # the container rejects must not abort the other rules.
                continue

    if inject is None:
        return text, extracted
    return text