Source code for keywords

# -*- coding: utf-8 -*-

'''
Module to compute keywords (words that are used significantly more frequently
in one corpus than they are in a reference corpus).
'''

import os
from math import log1p
import operator

from cheshire3.document import StringDocument
from cheshire3.internal import cheshire3Root
from cheshire3.server import SimpleServer
from cheshire3.baseObjects import Session

cheshirePath = os.path.join('HOME', '/home/cheshire')

[docs]class Keywords(object): ''' Class to compute keywords based on an test index (the corpus of analysis), a reference index (the corpus of reference), and a P value. ''' def __init__(self): ''' Make the connection with cheshire3. ''' self.session = Session() self.session.database = 'db_dickens' self.serv = SimpleServer(self.session, os.path.join(cheshire3Root, 'configs', 'serverConfig.xml') ) self.db = self.serv.get_object(self.session, self.session.database) self.qf = self.db.get_object(self.session, 'defaultQueryFactory') self.resultSetStore = self.db.get_object(self.session, 'resultSetStore') self.idxStore = self.db.get_object(self.session, 'indexStore') self.logger = self.db.get_object(self.session, 'keywordLogger')
[docs] def list_keywords(self, testIdxName, testMaterials, refIdxName, refMaterials, pValue): ''' Return a sorted list of keywords. Limited to the first 5000 items. ''' #self.logger.log(10, 'CREATING KEYWORDS FOR RS: {0} in {1}, compared to {2} in {3}'.format(testIdxName, testMaterials, refIdxName, refMaterials, pValue)) session = self.session db = self.db clauses = [] for testMaterial in testMaterials: if testMaterial in ['dickens', 'ntc']: testMatIdx = 'subCorpus-idx' else: testMatIdx = 'book-idx' clauses.append('c3.{0} = "{1}"'.format(testMatIdx, testMaterial)) test_query = self.qf.get_query(session, ' or '.join(clauses) ) test_results = db.search(session, test_query) test_idx = db.get_object(session, testIdxName) test_facets = test_idx.facets(session, test_results) ## create dictionary containing word/cluster and number of occurrences test_dict = {x[0]: x[1][2] for x in test_facets} # Reference results clauses_ref = [] for refMaterial in refMaterials: if refMaterial in ['dickens', 'ntc']: refMatIdx = 'subCorpus-idx' else: refMatIdx = 'book-idx' clauses_ref.append('c3.{0} = "{1}"'.format(refMatIdx, refMaterial)) ref_query = self.qf.get_query(session, ' or '.join(clauses_ref) ) ref_results = db.search(session, ref_query) ref_idx = db.get_object(session, refIdxName) ref_facets = ref_idx.facets(session, ref_results) ref_dict = {x[0]: x[1][2] for x in ref_facets} ## get test and ref lengths ## I use total counts to calculate expected values testLength = sum(test_dict.values()) refLength = sum(ref_dict.values()) kw_list = [] for term, freqTest in test_dict.iteritems(): if freqTest > 1: try: ## Method 1: how many observations of a given word is found in ref corpus but not in test corpus ## Subtract number of occurrences in testIndex from number of occurrences in sentences #freqRef = float(ref_dict[term] - freqTest) ## Method 2: treat groups as mutually exclusive. NOTE: When comparing quotes with whole text the occurrences will overlap freqRef = float(ref_dict[term]) except KeyError: freqRef = 5.0e-324 else: if freqRef <= 0: freqRef = 5.0e-324 ## following Paul Ryson formula for log likelihood (http://ucrel.lancs.ac.uk/llwizard.html) ## 1. Expected occurrence within corpus ## 1a. Expected reference value: based on sentence index ## - Get the total N from corpus 1 (reference corpus) ## - Multiply by the sum of observations found in ref corpus and those found in test corpus ## - Divide by the sum of total N in test corpus and reference corpus expectedRef = refLength*(freqTest+freqRef)/(testLength+refLength) ## 1b. Expected test value ## Equivalent steps to 1a, but multiply by test N expectedTest = testLength*(freqTest+freqRef)/(testLength+refLength) ## 2. Log Likelihood ## Compare actual observations with expected ocurrence for both test and ref, and add these values ## Use log1p() (for natural logarithm - ln) instead of log() if freqTest*log1p(freqTest/expectedTest) >= freqRef*log1p(freqRef/expectedRef): try: LL = 2*((freqTest*log1p(freqTest/expectedTest)) + (freqRef*log1p(freqRef/expectedRef))) LL = '%.2f' % LL except: LL = 909090 else: try: LL = -2*((freqTest*log1p(freqTest/expectedTest)) + (freqRef*log1p(freqRef/expectedRef))) LL = '%.2f' % LL except: LL = 909090 if freqRef == 5.0e-324: freqRef2 = 0 else: freqRef2 = int('%.0f' % freqRef) dec_Test = '%.2f' % freqTest dec_Ref = '%.2f' % freqRef propTest = (float(dec_Test)/testLength) * 100 propRef = (float(dec_Ref)/refLength) * 100 if float(pValue) == 0.000001: if float(LL) >= 23.93:# or float(LL) <= -23.93: ## We only deal with positive LL values kw_list.append(['', term, str(freqTest), '%.2f' % propTest, str(freqRef2), '%.2f' % propRef, float(LL), pValue]) else: if float(pValue) == 0.0000001: if float(LL) >= 28.38: kw_list.append(['', term, str(freqTest), '%.2f' % propTest, str(freqRef2), '%.2f' % propRef, float(LL), pValue]) if float(pValue) == 0.00000001: if float(LL) >= 32.85: kw_list.append(['', term, str(freqTest), '%.2f' % propTest, str(freqRef2), '%.2f' % propRef, float(LL), pValue]) if float(pValue) == 0.000000001: if float(LL) >= 37.33: kw_list.append(['', term, str(freqTest), '%.2f' % propTest, str(freqRef2), '%.2f' % propRef, float(LL), pValue]) if float(pValue) == 0.0000000001: if float(LL) >= 41.83: kw_list.append(['', term, str(freqTest), '%.2f' % propTest, str(freqRef2), '%.2f' % propRef, float(LL), pValue]) if float(pValue) == 0.00000000001: if float(LL) >= 46.33: kw_list.append(['', term, str(freqTest), '%.2f' % propTest, str(freqRef2), '%.2f' % propRef, float(LL), pValue]) if float(pValue) == 0.00001: if (float(LL) > 19.52):# or (float(LL) < -19.52): kw_list.append(['', term, str(freqTest), '%.2f' % propTest, str(freqRef2), '%.2f' % propRef, float(LL), pValue]) elif float(pValue) == 0.0001: if (float(LL) > 15.14):# or (float(LL) < -15.14): kw_list.append(['', term, str(freqTest), '%.2f' % propTest, str(freqRef2), '%.2f' % propRef, float(LL), pValue]) elif float(pValue) == 0.001: if (float(LL) > 10.83):# or (float(LL) < -10.83): kw_list.append(['', term, str(freqTest), '%.2f' % propTest, str(freqRef2), '%.2f' % propRef, float(LL), pValue]) elif float(pValue) == 0.01: if (float(LL) > 6.64):# or (float(LL) < -6.64): kw_list.append(['', term, str(freqTest), '%.2f' % propTest, str(freqRef2), '%.2f' % propRef, float(LL), pValue]) elif float(pValue) == 0.05: if (float(LL) > 3.85):# or (float(LL) < -3.85): kw_list.append(['', term, str(freqTest), '%.2f' % propTest, str(freqRef2), '%.2f' % propRef, float(LL), pValue]) elif float(pValue) == 0.1: ## NB: returns all values if (float(LL) > 2.71):# or (float(LL) < -2.71): kw_list.append(['', term, str(freqTest), '%.2f' % propTest, str(freqRef2), '%.2f' % propRef, float(LL), pValue]) ## sort by K value (descending) kw_list.sort(key=operator.itemgetter(6), reverse=True) ## reverse=TRUE for descending order return kw_list[0:4999]