Source code for searches.utils

# Copyright (C) 2017 Semester.ly Technologies, LLC
#
# Semester.ly is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Semester.ly is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.


from __future__ import division
import cPickle as pickle
import numpy as np
import operator
from sklearn.feature_extraction.text import TfidfTransformer
from django.db.models import Q
from timetable.models import Course
from nltk.stem.porter import PorterStemmer
import progressbar


def course_desc_contains_token(token):
    """Return a Q object matching courses whose description contains the token."""
    return Q(description__icontains=token)


def course_name_contains_token(token):
    """Return a Q object matching courses whose code or name contains the token."""
    return (Q(code__icontains=token) |
            Q(name__icontains=token.replace("&", "and")) |
            Q(name__icontains=token.replace("and", "&")))
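

# Illustrative sketch (not part of the original module): one way a caller
# might combine the two Q helpers above into a single course filter. The
# helper name and token list are hypothetical.
def _example_token_filter(tokens):
    """AND together, per token, an OR of the name and description matchers."""
    combined = Q()
    for token in tokens:
        combined &= (course_name_contains_token(token) |
                     course_desc_contains_token(token))
    return Course.objects.filter(combined)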


class Vectorizer:
    """Creates a dictionary over courses and builds course vectors using a
    count vectorizer."""

    def __init__(self):
        self.TITLE_WEIGHT = 3
        self.stemmer = PorterStemmer()

    def vectorize(self):
        """Transform every course into a TF-IDF vector and save it to the
        course model."""
        raw_word_counts = []
        bar = progressbar.ProgressBar(max_value=Course.objects.count())
        print("Stringifying all courses for vectorization...")
        for current_count, course in enumerate(Course.objects.all().iterator()):
            raw_word_counts.append(self.course_to_str(course.name,
                                                      course.description,
                                                      course.areas,
                                                      self.TITLE_WEIGHT))
            bar.update(current_count)
        print("Transforming all courses into vectors...")
        with open('searches/dictionary.pickle', 'rb') as handle:
            count_vectorizer = pickle.load(handle)
        processed_word_counts = count_vectorizer.transform(raw_word_counts)
        tfidf_tf = TfidfTransformer(use_idf=True).fit(processed_word_counts)
        course_vectors = tfidf_tf.transform(processed_word_counts)
        bar.update(0)
        print("Saving all course vectors...")
        # Save each course's vector to its model row, relying on the same
        # iteration order used to build raw_word_counts above.
        for current_count, course in enumerate(Course.objects.all().iterator()):
            course.vector = course_vectors[current_count]
            course.save()
            bar.update(current_count)

    def course_to_str(self, name, description, area, weight):
        """Return a stemmed string representation of a course, repeating the
        name `weight` times to boost title matches."""
        stemmed_doc = ""
        if name:
            name_doc = name.encode('ascii', 'ignore')
            stemmed_name_doc = self.doc_to_lower_stem_str(name_doc)
            stemmed_doc += (' ' + stemmed_name_doc) * weight + " "
        if description:
            desc_doc = description.encode('ascii', 'ignore')
            # Trailing space keeps the description from fusing with the area.
            stemmed_doc += self.doc_to_lower_stem_str(desc_doc) + " "
        if area:
            area_doc = area.encode('ascii', 'ignore')
            stemmed_doc += self.doc_to_lower_stem_str(area_doc)
        return stemmed_doc

    def doc_to_lower_stem_str(self, doc):
        """Lowercase and Porter-stem every word in a document string."""
        return ' '.join([self.stemmer.stem(w.lower()) for w in doc.split(' ')])
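

# Illustrative sketch (not part of the original module): what
# doc_to_lower_stem_str produces for a sample title. Exact stems may vary
# slightly with the NLTK version.
def _example_stemming():
    vectorizer = Vectorizer()
    # "Introduction to Computing Systems" -> "introduct to comput system"
    return vectorizer.doc_to_lower_stem_str("Introduction to Computing Systems")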


class Searcher:
    """Implements baseline search and vectorized search based on information
    retrieval techniques."""

    def __init__(self):
        self.vectorizer = Vectorizer()
        self.count_vectorizer = self.load_count_vectorizer()
        self.MAX_CAPACITY = 300
        self.start_time = 0

    def load_count_vectorizer(self):
        """Load the pickled English-dictionary count vectorizer."""
        with open('searches/dictionary.pickle', 'rb') as handle:
            return pickle.load(handle)

    def vectorize_query(self, query):
        """Vectorize a user's query using the count vectorizer."""
        stemmed_qry = self.vectorizer.doc_to_lower_stem_str(query)
        query_vector = self.count_vectorizer.transform([stemmed_qry])
        return query_vector

    def get_acronym(self, name):
        """Return the acronym of a course name, ignoring "and"/"&"."""
        name = name.replace("and", "").replace("&", "").lower()
        # split() with no argument drops the empty strings left behind by the
        # replacements, which would otherwise raise an IndexError on i[0].
        return ''.join([i[0] for i in name.split()])
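
    # Illustrative sketch (not part of the original class): a worked
    # get_acronym example on a hypothetical course name.
    def _example_acronym(self):
        # "Algorithms and Data Structures" -> "ads"
        return self.get_acronym("Algorithms and Data Structures")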

    def matches_name(self, query, course_name):
        """Score a query's match against a course name: 2 for an exact title
        match, 1 if the title contains every query token, 0 otherwise."""
        query_tokens = query.strip().lower().split(' ')
        course_name = course_name.lower()
        title_contains_query = all(token in course_name for token in query_tokens)
        if title_contains_query and len(query_tokens) == len(course_name.split()):
            return 2
        elif title_contains_query:
            return 1
        else:
            return 0
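
    # Illustrative sketch (not part of the original class): worked examples
    # of the matches_name scoring above, with hypothetical inputs.
    def _example_name_scores(self):
        exact = self.matches_name("data structures", "Data Structures")  # -> 2
        partial = self.matches_name("data", "Data Structures")           # -> 1
        none = self.matches_name("chemistry", "Data Structures")         # -> 0
        return exact, partial, none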

    def get_cosine_sim(self, sparse_vec1, sparse_vec2):
        """Compute the similarity of two sparse vectors as their dot product.

        Course vectors from TfidfTransformer are L2-normalized by default,
        so for a fixed query this dot product ranks courses identically to
        true cosine similarity.
        """
        if sparse_vec1 is not None and sparse_vec2 is not None:
            return np.sum(sparse_vec1.multiply(sparse_vec2))
        else:
            return 0

    def get_similarity(self, query, course):
        """Vectorize the query and return its similarity to the course vector."""
        query_vector = self.vectorize_query(query.lower())
        return self.get_cosine_sim(query_vector, course.vector)
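
    # Illustrative sketch (not part of the original class): get_cosine_sim on
    # two hand-built unit-length sparse vectors.
    def _example_cosine_sim(self):
        from scipy.sparse import csr_matrix
        v1 = csr_matrix([[0.6, 0.8, 0.0]])
        v2 = csr_matrix([[0.6, 0.0, 0.8]])
        # 0.6*0.6 + 0.8*0.0 + 0.0*0.8 = 0.36
        return self.get_cosine_sim(v1, v2)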

    def get_most_relevant_filtered_courses(self, query, course_filtered):
        """Return the filtered courses sorted by descending relevance to the
        query."""
        query_vector = self.vectorize_query(query.lower())
        return sorted(course_filtered,
                      key=lambda course: -self.get_score(course, query, query_vector))

    def get_score(self, course, query, query_vector):
        """Combine cosine similarity with the name-match bonus into one score."""
        return (self.get_cosine_sim(query_vector, course.vector) +
                self.matches_name(query, course.name))

    def wordify(self, course_vector):
        """Print the words that a course vector maps back to, via the count
        vectorizer's inverse transform."""
        print(self.count_vectorizer.inverse_transform(course_vector))

    def print_similiarity_scores(self, courses, query):
        """Print the ten highest course similarity scores for a query (for
        debugging)."""
        query_vector = self.vectorize_query(query.lower())
        ranked = sorted(courses,
                        key=lambda course: -self.get_score(course, query, query_vector))
        for course in ranked[:10]:
            print(course.name + ":" + str(self.get_score(course, query, query_vector)) + "\n")
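

# Illustrative sketch (not part of the original module): an end-to-end search
# pass combining the pieces above. The query string is hypothetical, and
# Searcher() assumes searches/dictionary.pickle exists and that course
# vectors have been populated by Vectorizer.vectorize().
def _example_search(query="introduction to computer science"):
    searcher = Searcher()
    # Baseline filter: keep courses whose name/code or description mention
    # any query token, capped at MAX_CAPACITY candidates.
    q_filter = Q()
    for token in query.split():
        q_filter |= (course_name_contains_token(token) |
                     course_desc_contains_token(token))
    candidates = Course.objects.filter(q_filter)[:searcher.MAX_CAPACITY]
    # Re-rank the candidates by combined TF-IDF similarity + name match.
    return searcher.get_most_relevant_filtered_courses(query, list(candidates))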