Source code for parsing.library.validator

# Copyright (C) 2017 Semester.ly Technologies, LLC
#
# Semester.ly is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Semester.ly is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.


# TODO - consider something to load db field sizes into validator
#        However, that would ruin the purity of the adapter.

from __future__ import absolute_import, division, print_function

import dateutil.parser as dparser
import httplib
import jsonschema
import logging
import re
import simplejson as json

# Contains BASE_DIR and PARSING_MODULE.
from django.conf import settings

from parsing.library.tracker import Tracker
from parsing.library.exceptions import PipelineError, PipelineWarning
from parsing.library.utils import DotDict, dir_to_dict, SimpleNamespace


class ValidationError(PipelineError):
    """Error raised by the validator when an object is invalid."""
class ValidationWarning(PipelineWarning):
    """Warning raised by the validator for non-fatal problems."""
class MultipleDefinitionsWarning(ValidationWarning):
    """Warning raised when a key is defined more than once in the data."""
class Validator:
    """Validation engine in parsing data pipeline.

    Attributes:
        config (:obj:`DotDict`): Loaded config.json.
        course_code_regex (:obj:`re`): Regex to match course code.
        kind_to_validation_function (:obj:`dict`):
            Map kind to validation function defined within this class.
            Kinds without a ``validate_<kind>`` method map to a no-op.
        KINDS (:obj:`set`): Kinds of objects that validator validates.
        relative (:obj:`bool`): Enforce relative ordering in validation.
        seen (:obj:`dict`): Running monitor of seen courses and sections;
            maps course code to a set of (section code, year, term) tuples.
        tracker (:obj:`parsing.library.tracker.Tracker`)
        transaction (:obj:`SimpleNamespace`): Course key and section values
            collected by the ``validate`` call currently in progress.
    """

    KINDS = {
        'config',
        'datalist',
        'course',
        'section',
        'meeting',
        'directory',
        'eval',
        'instructor',
        'final_exam',
        'textbook',
        'textbook_link',
    }

    def __init__(self, config, tracker=None, relative=True):
        """Construct validator instance.

        Args:
            config (dict): School config dictionary.
            tracker (None, optional): Tracker to report to. When omitted a
                fresh Tracker is created and started (self-contained mode).
            relative (bool, optional): Enforce relative ordering in
                validation.
        """
        Validator.load_schemas()

        def _no_validation(*_, **__):
            return None

        # Kinds with no dedicated method are only checked against schema.
        self.kind_to_validation_function = {
            kind: getattr(self, 'validate_' + kind, _no_validation)
            for kind in Validator.KINDS
        }

        # Running monitor of validated course and section codes.
        self.seen = {}

        self.config = DotDict(config)
        self.config['kind'] = 'config'
        self.validate(self.config)

        self.course_code_regex = re.compile(self.config.course_code_regex)
        self.relative = relative

        if tracker is None:
            # Used during self-contained validation.
            self.tracker = Tracker()
            self.tracker.school = self.config.school.code
            self.tracker.mode = 'validating'
            self.tracker.start()
        else:
            self.tracker = tracker

    @classmethod
    def load_schemas(cls, schema_path=None):
        """Load JSON validation schemas.

        NOTE: Will load schemas as static variable (i.e. once per
              definition), unless schema_path is specifically defined.

        Args:
            schema_path (None, str, optional): Override default schema_path
        """
        if schema_path is None:
            if hasattr(cls, 'SCHEMAS'):
                return
            schema_path = '{}/{}/library/schemas'.format(
                settings.BASE_DIR,
                settings.PARSING_MODULE
            )

        def load(kind):
            filepath = '{}/{}.json'.format(schema_path, kind)
            with open(filepath, 'r') as schema_file:
                schema = json.load(schema_file)
            resolver = jsonschema.RefResolver(
                'file://{}/'.format(schema_path),
                schema
            )
            return (schema, resolver)

        cls.SCHEMAS = DotDict({kind: load(kind) for kind in cls.KINDS})

    # TODO - make into a namedtuple instead

    @staticmethod
    def schema_validate(data, schema, resolver=None):
        """Validate data object with JSON schema alone.

        Args:
            data (dict): Data object to validate.
            schema: JSON schema to validate against.
            resolver (None, optional): JSON Schema reference resolution.

        Raises:
            ValidationError: Invalid object; wraps the jsonschema
                validation error together with the offending data.
        """
        validator = jsonschema.Draft4Validator(schema, resolver=resolver)
        try:
            validator.validate(data)
        except jsonschema.exceptions.ValidationError as e:
            raise ValidationError(data, *e.args)

    # TODO - Create iter_errors from jsonschema validator
    # NOTE: if modifying schemas it may be prudent to catch:
    #       jsonschema.exceptions.SchemaError
    #       jsonschema.exceptions.RefResolutionError

    @staticmethod
    def file_to_json(path, allow_duplicates=False):
        """Load file pointed to by path into json object dictionary.

        Args:
            path (str): Path of the JSON file to read.
            allow_duplicates (bool, optional): Allow duplicate keys in JSON.

        Returns:
            dict: JSON-compliant dictionary.

        Raises:
            ValidationError: Duplicate key found while duplicates are
                disallowed.
        """
        def raise_on_duplicates(ordered_pairs):
            """Reject duplicate keys in dictionary."""
            d = {}
            for k, v in ordered_pairs:
                if k in d:
                    raise ValidationError("duplicate key: %r" % (k,))
                d[k] = v
            return d

        with open(path, 'r') as f:
            if allow_duplicates:
                return json.load(f)
            return json.load(f, object_pairs_hook=raise_on_duplicates)

    def validate(self, data, transact=True):
        """Validation entry/dispatcher.

        Args:
            data (list, dict): Data to validate.
            transact (bool, optional): Start a new transaction for this
                object and commit it to ``seen`` on success. Nested calls
                pass False so that they share the parent's transaction.
        """
        if transact:
            self.transaction = SimpleNamespace(key=None, values=set())

        data = DotDict(data)
        Validator.schema_validate(data, *Validator.SCHEMAS[data.kind])
        self.kind_to_validation_function[data.kind](data)

        if transact and self.transaction.key:
            self.seen.setdefault(self.transaction.key, set()).update(
                self.transaction.values
            )

    def validate_self_contained(self, data_path,
                                break_on_error=True,
                                break_on_warning=False,
                                output_error=None,
                                display_progress_bar=True,
                                master_log_path=None):
        """Validate JSON file as without ingestor.

        Args:
            data_path (str): Path to data file.
            break_on_error (bool, optional): Re-raise the first
                ValidationError instead of continuing.
            break_on_warning (bool, optional): Re-raise the first
                ValidationWarning instead of continuing.
            output_error (None, optional): Error output file path.
                Currently unused by this method.
            display_progress_bar (bool, optional): Currently unused by
                this method.
            master_log_path (None, optional): Currently unused by this
                method.

        Raises:
            ValidationError: Invalid object and break_on_error is set.
            ValidationWarning: Warning raised and break_on_warning is set.
        """
        data = Validator.file_to_json(data_path)['$data']
        # Validator.schema_validate(data, *Validator.SCHEMAS.datalist)

        for obj in map(DotDict, data):
            try:
                self.validate(obj)
                self.tracker.stats = dict(kind=obj.kind, status='valid')
            except ValidationError:
                logging.exception('Validation error')
                if break_on_error:
                    # Bare raise keeps the original subclass and traceback.
                    raise
            except ValidationWarning as e:
                logging.warning(e)
                if break_on_warning:
                    raise
            self.tracker.stats = dict(kind=obj.kind, status='total')

        # TODO - this should be handled by caller
        self.tracker.end()

    def validate_course(self, course):
        """Validate course.

        Args:
            course (DotDict): Course object to validate.

        Raises:
            MultipleDefinitionsWarning: Course has already been validated in
                same session.
            ValidationError: Invalid course.
        """
        if 'kind' in course and course.kind != 'course':
            raise ValidationError(course,
                                  'course object must be of kind course')

        if ('school' in course and
                course.school.code != self.config.school.code):
            raise ValidationError(course,
                                  'course schools does not match config')

        if self.course_code_regex.match(course.code) is None:
            raise ValidationError(
                course,
                "course code {} does not match r'{}'".format(
                    course.code,
                    self.config.course_code_regex
                )
            )

        if ('department' in course and
                'code' in course.department and
                'departments' in self.config):
            department_codes = {d.code for d in self.config.departments}
            if course.department.code not in department_codes:
                raise ValidationError(
                    course,
                    'department {} is not in config.json departments'.format(
                        course.department)
                )

        if 'homepage' in course:
            self.validate_website(course.homepage)

        for sa in course.get('same_as', []):
            # NOTE: same_as codes are matched against the regex but a
            #       mismatch is currently not reported.
            if self.course_code_regex.match(sa) is not None:
                continue

        if self.relative:
            if course.code in self.seen:
                raise MultipleDefinitionsWarning(
                    course,
                    'multiple definitions of course {}'.format(course.code)
                )
            self.transaction.key = course.code

        for section in course.get('sections', []):
            if ('course' in section and
                    section['course']['code'] != course.code):
                raise ValidationError(
                    course,
                    'nested {} does not match parent {}'.format(
                        section['course']['code'],
                        course.code
                    )
                )

            # NOTE: mutating dictionary
            section['course'] = {'code': course.code}
            section['kind'] = 'section'
            self.validate(DotDict(section), transact=False)

    def validate_section(self, section):
        """Validate section object.

        Args:
            section (DotDict): Section object to validate.

        Raises:
            MultipleDefinitionsWarning: Section has already been validated
                for the same course, year and term.
            ValidationError: Invalid section.
        """
        if 'course' not in section:
            raise ValidationError(section,
                                  'section doesnt define a parent course')

        if 'kind' in section and section.kind != 'section':
            raise ValidationError(section, 'section must be of kind section')

        if self.course_code_regex.match(section.course.code) is None:
            raise ValidationError(
                section,
                'course code {} does not match r\'{}\''.format(
                    section.course.code,
                    self.config.course_code_regex
                )
            )

        if 'term' in section and section.term not in self.config.terms:
            raise ValidationError(
                section,
                'term {} not in config.json term list'.format(section.term)
            )

        if 'instructors' in section:
            db_instructor_textfield_max_size = 500
            instructors = [DotDict(i) for i in section.get('instructors', [])]

            # NOTE(review): names are concatenated without a separator even
            #               though the message speaks of comma-joined names;
            #               confirm against how the ingestor stores them.
            names = []
            for instructor in instructors:
                if isinstance(instructor.name, basestring):
                    names.append(instructor.name)
                elif isinstance(instructor.name, dict):
                    names.append('{} {}'.format(instructor.name.first,
                                                instructor.name.last))

            if len(''.join(names)) > db_instructor_textfield_max_size:
                raise ValidationError(
                    section,
                    'db field too small for comma-joined instructor names'
                )

            # Wrapped instructors support the attribute access that
            # validate_instructor relies on.
            for instructor in instructors:
                self.validate_instructor(instructor)

        if 'final_exam' in section:
            final_exam = section.final_exam
            if ('course' in final_exam and
                    final_exam.course.code != section.course.code):
                raise ValidationError(
                    section,
                    'final exam course {} doesnt match course code {}'.format(
                        final_exam.course.code,
                        section.course.code
                    )
                )
            if ('section' in final_exam and
                    final_exam.section.code != section.code):
                raise ValidationError(
                    section,
                    'final exam section {} doesnt match section {}'.format(
                        final_exam.section.code,
                        section.code
                    )
                )
            # NOTE: the nested final exam is only cross-checked here; it is
            #       not forwarded to validate_final_exam.

        if self.relative:
            course_code = section.course.code
            if (course_code not in self.seen and
                    self.transaction.key != course_code):
                raise ValidationError(
                    section,
                    'course code {} isnt defined'.format(course_code)
                )

            section_key = (section.code, section.year, section.term)
            known = self.seen.get(course_code, set()) | self.transaction.values
            if section_key in known:
                raise MultipleDefinitionsWarning(
                    section,
                    'multiple defs for {} {} - {} already defined'.format(
                        section.course.code,
                        section.code,
                        section.year
                    )
                )
            self.transaction.key = course_code
            self.transaction.values.add(section_key)

        for meeting in section.get('meetings', []):
            meeting = DotDict(meeting)
            if ('course' in meeting and
                    meeting.course.code != section.course.code):
                raise ValidationError(
                    section,
                    'course code {} in meeting doesnt match parent section '
                    'course code {}'.format(
                        meeting.course.code,
                        section.course.code
                    )
                )
            if 'section' in meeting and meeting.section.code != section.code:
                raise ValidationError(
                    section,
                    'section code {} in nested meeting doesnt match parent '
                    'section code {}'.format(
                        meeting.section.code,
                        section.code
                    )
                )

            # NOTE: mutating obj
            meeting['course'] = section.course
            meeting['section'] = {
                'code': section.code,
                'year': section.year,
                'term': section.term
            }
            meeting['kind'] = 'meeting'
            self.validate(DotDict(meeting), transact=False)

        if 'textbooks' in section:
            # Dispatch through the kind map: it falls back to a no-op when
            # no validate_textbook_link method is defined on the class.
            validate_link = self.kind_to_validation_function['textbook_link']
            for textbook in section.textbooks:
                validate_link(textbook)

    @staticmethod
    def _meeting_label(meeting):
        """Build the 'meeting for <course> <section>, ' error prefix.

        Missing course/section codes are rendered as '<unknown>' so that
        building the message never hides the original error.
        """
        course = meeting.get('course') or {}
        section = meeting.get('section') or {}
        return 'meeting for {} {}, '.format(
            course.get('code', '<unknown>'),
            section.get('code', '<unknown>')
        )

    def validate_meeting(self, meeting):
        """Validate meeting object.

        Args:
            meeting (DotDict): Meeting object to validate.

        Raises:
            ValidationError: Invalid meeting.
            ValidationWarning: Warning raised while validating the
                meeting's time range.
        """
        if 'kind' in meeting and meeting.kind != 'meeting':
            raise ValidationError(meeting,
                                  'meeting object must be kind meeting')

        if ('course' in meeting and
                self.course_code_regex.match(meeting.course.code) is None):
            raise ValidationError(
                meeting,
                'course code {} does not match regex \'{}\''.format(
                    meeting.course.code,
                    self.config.course_code_regex
                )
            )

        if 'time' in meeting:
            try:
                self.validate_time_range(meeting.time.start, meeting.time.end)
            except ValidationError as e:
                raise ValidationError(self._meeting_label(meeting), *e.args)
            except ValidationWarning as e:
                raise ValidationWarning(self._meeting_label(meeting), *e.args)

        if 'location' in meeting:
            try:
                self.validate_location(meeting.location)
            except ValidationError as e:
                raise ValidationError(self._meeting_label(meeting), *e.args)

        if not self.relative:
            return

        # The parent course must have been seen already or be the one that
        # the current transaction is defining.
        if ('course' in meeting and
                meeting.course.code not in self.seen and
                self.transaction.key != meeting.course.code):
            raise ValidationError(
                meeting,
                'course code {} isnt defined'.format(meeting.course.code)
            )

        if 'section' not in meeting:
            return

        section_key = (meeting.section.code,
                       meeting.section.year,
                       meeting.section.term)
        known = (self.seen.get(meeting.course.code, set()) |
                 self.transaction.values)
        if section_key not in known:
            raise ValidationError(
                meeting,
                'section {} isnt defined'.format(meeting.section.code)
            )

    def validate_eval(self, course_eval):
        """Validate evaluation object.

        Args:
            course_eval (DotDict): Evaluation to validate.

        Raises:
            ValidationError: Invalid evaluation.
        """
        if self.course_code_regex.match(course_eval.course.code) is None:
            raise ValidationError(
                course_eval,
                "course code {} does not match r'{}'".format(
                    course_eval.course.code,
                    self.config.course_code_regex
                )
            )

    def validate_instructor(self, instructor):
        """Validate instructor object.

        Args:
            instructor (DotDict): Instructor object to validate.

        Raises:
            ValidationError: Invalid instructor.
        """
        if 'kind' in instructor and instructor.kind != 'instructor':
            raise ValidationError(
                instructor,
                'instructor object must be of kind instructor'
            )

        for class_ in instructor.get('classes', []):
            if ('course' in class_ and
                    self.course_code_regex.match(class_.course.code) is None):
                raise ValidationError(
                    instructor,
                    'course code {} does not match given regex {}'.format(
                        class_.course.code,
                        self.config.course_code_regex
                    )
                )

        if 'department' in instructor and 'departments' in self.config:
            dept_codes = {d.code for d in self.config.departments}
            if instructor.department not in dept_codes:
                raise ValidationError(
                    instructor,
                    'department {} not listed in config.json'.format(
                        instructor.department
                    )
                )

        if 'homepage' in instructor:
            try:
                self.validate_website(instructor.homepage)
            except ValidationError as e:
                message = 'instructor {} homepage'.format(instructor.name)
                raise ValidationError(message, *e.args)

        if 'office' in instructor:
            try:
                if 'location' in instructor.office:
                    self.validate_location(instructor.office.location)
                for office_hour in instructor.office.get('hours', []):
                    self.validate_meeting(DotDict(office_hour))
            except ValidationError as e:
                message = 'instructor {} office'.format(instructor.name)
                raise ValidationError(message, *e.args)

    def validate_final_exam(self, final_exam):
        """Validate final exam.

        NOTE: currently unused.

        Args:
            final_exam (DotDict): Final Exam object to validate.

        Raises:
            ValidationError: Invalid final exam.
        """
        if 'kind' in final_exam and final_exam.kind != 'final_exam':
            raise ValidationError(
                final_exam,
                'final_exam object must be of kind "final_exam"'
            )
        try:
            self.validate_meeting(final_exam.meeting)
        except ValidationError as e:
            raise ValidationError(final_exam, *e.args)

    def validate_location(self, location):
        """Validate location.

        Args:
            location (DotDict): Location object to validate.

        Raises:
            ValidationWarning: Campus or building is not listed in config.
        """
        if 'campus' in location and 'campuses' in self.config:
            if location.campus not in self.config.campuses:
                raise ValidationWarning(
                    location,
                    'campus {} not in config'.format(location.campus),
                )

        if 'building' in location and 'buildings' in self.config:
            if location.building not in self.config.buildings:
                raise ValidationWarning(
                    location,
                    'building {} not in config'.format(location.building),
                )

    @staticmethod
    def validate_website(url):
        """Validate url by sending HEAD request and analyzing response.

        NOTE(review): url is handed to httplib.HTTPConnection as the host
                      argument; confirm callers pass a host rather than a
                      full URL with scheme.

        Args:
            url (str): URL to validate.

        Raises:
            ValidationError: URL is invalid.
        """
        connection = httplib.HTTPConnection(url)
        try:
            connection.request('HEAD', '')
            # Read the response exactly once; a second getresponse() on the
            # same request is not permitted by httplib.
            status = connection.getresponse().status
        finally:
            connection.close()

        # NOTE: 200 - good status
        #       301 - redirected
        if status in (httplib.OK, httplib.MOVED_PERMANENTLY):
            return
        raise ValidationError(url, 'invalid website w/url "{}"'.format(url))

    def validate_time_range(self, start, end):
        """Validate start time and end time.

        There exists an unhandled case if the end time is midnight.

        Args:
            start (str): Start time.
            end (str): End time.

        Raises:
            ValidationError: Time range is invalid.
        """
        try:
            start_time = dparser.parse(start)
            end_time = dparser.parse(end)
        except (ValueError, OverflowError):
            raise ValidationError(
                'invalid time format {}-{}'.format(start, end)
            )

        if start_time > end_time:
            raise ValidationError(
                'start {} > end {}'.format(start_time, end_time)
            )
        # TODO - start == end should be reported
        # raise ValidationWarning('start {} = end {}'.format(start, end))

    def validate_directory(self, directory):
        """Validate directory.

        Args:
            directory (str, dict): Directory to validate.
                May be either path or object.

        Raises:
            ValidationError: encapsulated IOError
        """
        # basestring also covers unicode paths (Python 2).
        if isinstance(directory, basestring):
            try:
                name = directory
                directory = dir_to_dict(directory)
                directory['name'] = name
            except IOError as e:
                raise ValidationError(str(e))
        Validator.schema_validate(directory, *Validator.SCHEMAS.directory)