
# Data and computations for paper 

Lars G. Johnsen
National Library of Norway

In [138]:
import dhlab.nbtext as nb
import dhlab.module_update as mu
mu.update('collocations')
from collocations import dist_coll_urn
import pandas as pd

Updated file `D:\Documents\GitHub\newspapers_coll_conc\collocations.py`

In [139]:
def df_jaccard(df, col1, col2, number=100, asc=False):
    """Jaccard similarity between the top rows of `df` under two different rankings.

    The frame is sorted once by `col1` and once by `col2` (descending unless
    `asc` is true); the index labels of the first `number` rows of each
    ordering are then compared with `jaccard`.
    """
    def leading_labels(column):
        # Index labels of the first `number` rows when ordering by `column`.
        ranked = df.sort_values(by=column, ascending=asc)
        return ranked[:number].index

    return jaccard(leading_labels(col1), leading_labels(col2))

In [140]:
def jaccard(s1, s2):
    """Return the Jaccard similarity |A ∩ B| / |A ∪ B| of two collections.

    Both arguments may be any iterables of hashable items; each is turned into
    a set exactly once, so one-shot iterators are handled correctly.

    When both collections are empty the ratio is undefined; 0.0 is returned in
    that case instead of raising ZeroDivisionError.
    """
    first, second = set(s1), set(s2)
    union = first | second
    if not union:
        # Nothing to compare: report "no overlap" rather than dividing by zero.
        return 0.0
    return len(first & second) / len(union)

In [141]:
def make_coll_df(small, large, tot):
    """Build a table of collocation scores from two collocations and a reference.

    `small` and `large` are collocation frames with a 'freq' column (small and
    large window); `tot` is the reference and must have a column called 'tot'.

    The result is indexed like `small['freq']` and has the columns:
      small, large -- the raw counts from the two windows
      srel, lrel   -- each count divided by the sum of its column
      ratio        -- srel / lrel
      nb           -- small count divided by the reference value in `tot`
    """
    table = pd.DataFrame()
    # The first assignment fixes the index; later assignments are aligned to it.
    table['small'] = small['freq']
    table['large'] = large['freq']
    # Share of each word within its own column of counts.
    for share, counts in (('srel', 'small'), ('lrel', 'large')):
        table[share] = table[counts] / table[counts].sum()
    table['ratio'] = table['srel'] / table['lrel']
    table['nb'] = table['small'] / tot['tot']
    return table

### Reference corpus

The reference is the total counts from approximately 450 000 books from nb.no

In [142]:
# Reference word counts, wrapped in a frame whose single column is named 'tot'.
# NOTE(review): 50000 is presumably the number of top-ranked words requested — confirm against nb.totals.
tot = nb.frame(nb.totals(50000), 'tot')

In [143]:
# Convert the reference counts to relative frequencies (the preview below shows proportions).
# NOTE(review): the return value is not assigned, so this presumably modifies `tot` in place — confirm.
nb.normalize_corpus_dataframe(tot)
tot.head()

Unnamed: 0,tot
.,0.058921
",",0.051453
og,0.025297
i,0.021425
det,0.012728


# Corpus

The corpus for the collocation analysis is a sample of up to 5,000 books (the `limit` used below) of fictional literature, Dewey decimal code 813, from the period 1980–2000.

In [144]:
# Books whose Dewey code matches "813%" from the period 1980-2000, at most 5000 of them.
corpus = nb.book_corpus(ddk="813%", period=(1980, 2000), limit = 5000)

In [145]:
# Show the corpus metadata (urn, author, title, year).
corpus

Unnamed: 0,urn,author,title,year
0,2007112601038,"Conroy, Pat",Tidevannets fyrste,1992
1,2009070200063,"Garlock, Dorothy",Hjertets stemme,1993
2,2008030300123,"Morrell, David",Tilbake til livet,1995
3,2008090100099,"William, Kate",Bitre rivaler,1999
4,2016061508057,"Stout, Rex",Sin fars hus,1980
...,...,...,...,...
4995,2009072100092,"Dixon, Franklin W.",Hardyguttene løser kodemysteriet,1991
4996,2015030606048,"Dickson, Carter",Bøddelen går igjen,1985
4997,2009011500040,"Stine, R.L.",Hvordan jeg lærte å fly,2000
4998,2008011601039,"Chandler, Raymond",Lillesøster,1994


Set up the distance parameters and the target word `collword`: `smd` is the small distance and `lmd` is the large distance. These values are half of the actual window size, and are used to make a normalized score, called ascore. See below.

In [146]:
# Half-window sizes: the collocation queries further down use 2*smd and 2*lmd.
smd = 5
lmd = 10
# Target word whose collocates are collected.
collword = 'spise'

In [147]:
def large_corpus_coll(collword, urns=None, after=5, before=5, n=300):
    """Collect collocation counts for `collword` over many URNs, querying in batches.

    Parameters
    ----------
    collword : the target word passed on to `nb.urn_coll`.
    urns     : sequence of URNs (anything supporting `len` and slicing).
               `None` or an empty sequence yields an empty result.
    after, before : window sizes passed on to `nb.urn_coll`.
    n        : number of URNs sent per request; must be at least 1.

    Returns
    -------
    A one-column DataFrame (column label 0) holding, for every index label, the
    sum over all batches; a label missing from a batch contributes nothing.
    # NOTE(review): assumes nb.urn_coll returns a frame/series indexed by word — confirm.

    Raises
    ------
    ValueError if `n` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive batch size")
    if urns is None or len(urns) == 0:
        # No URNs means no requests; pd.concat would raise on an empty list,
        # so return an empty frame with the same one-column layout.
        return pd.DataFrame(pd.Series(dtype='float64'))

    batches = [
        nb.urn_coll(collword, urns=urns[start:start + n], after=after, before=before)
        for start in range(0, len(urns), n)
    ]
    # Align the batches side by side on their index, then add them up row-wise.
    summed = pd.concat(batches, axis=1, sort=False).sum(axis=1)
    return pd.DataFrame(summed)

In [None]:
# Collocates AFTER the target word (before = 0): a1 with the small window (2*smd),
# a2 with the large window (2*lmd). Every call runs over the whole corpus in batches.
a1 = nb.frame(large_corpus_coll(collword, urns=list(corpus.urn), after= int(2*smd), before = 0), 'freq')
a2 = nb.frame(large_corpus_coll(collword, urns=list(corpus.urn), after= int(2*lmd), before = 0), 'freq')

In [None]:
# Collocates BEFORE the target word (after = 0): b1 with the small window (2*smd),
# b2 with the large window (2*lmd). Every call runs over the whole corpus in batches.
b1 = nb.frame(large_corpus_coll(collword, urns=list(corpus.urn), after=0, before = int(2*smd)), 'freq')
b2 = nb.frame(large_corpus_coll(collword, urns=list(corpus.urn), after=0, before = int(2*lmd)), 'freq')

## Create collocation dataframe 

Based on data from a1 and a2, and b1 and b2. The name for the collocations after is `coll` while `collb` is for the collocates coming before.

In [None]:
# coll: scores for collocates following the target word (built from a1, a2).
# collb: scores for collocates preceding the target word (built from b1, b2).
coll = make_coll_df(a1, a2, tot)
collb = make_coll_df(b1, b2, tot)

## Sorting 

Sorting on the reference is by column 'nb'

In [None]:
# Collocates after the target word with a small-window count above 1,
# ranked by the reference score 'nb'; the 20 highest are shown.
(
    coll.loc[coll['small'] > 1]
    .sort_values(by='nb', ascending=False)
    .head(20)
    .fillna(0)
    .style.background_gradient()
)

In [None]:
# Collocates after the target word whose small-window count is below the
# large-window count, ranked by 'ratio'; the 20 highest are shown.
(
    coll.loc[coll['small'] < coll['large']]
    .sort_values(by='ratio', ascending=False)
    .head(20)
    .fillna(0)
    .style.background_gradient()
)

# collb

In [None]:
# Collocates before the target word whose small-window count is below the
# large-window count, ranked by 'ratio'; the 40 highest are shown.
(
    collb.loc[collb['small'] < collb['large']]
    .sort_values(by='ratio', ascending=False)
    .head(40)
    .fillna(0)
    .style.background_gradient()
)

In [None]:
# Collocates before the target word ranked by the reference score 'nb'; top 40.
(
    collb
    .sort_values(by='nb', ascending=False)
    .head(40)
    .fillna(0)
    .style.background_gradient()
)

# coll

In [None]:
# Collocates after the target word ranked by 'ratio'; top 10.
(
    coll
    .sort_values(by='ratio', ascending=False)
    .head(10)
    .fillna(0)
    .style.background_gradient()
)

In [None]:
# Collocates after the target word ranked by the reference score 'nb'; top 10.
# `make_coll_df` only creates small/large/srel/lrel/ratio/nb, so selecting
# 'mass_dist' and 'combo' unconditionally raises KeyError on a fresh kernel.
# Only the requested columns that actually exist in `coll` are shown.
(
    coll
    .sort_values(by='nb', ascending=False)
    .loc[:, [name for name in ('nb', 'mass_dist', 'combo') if name in coll.columns]]
    .head(10)
    .fillna(0)
    .style.background_gradient()
)

# jaccard similarity

# for collb

In [None]:
# For the collocates before the target word: Jaccard similarity between the
# top-N words ranked by 'nb' and the top-N ranked by each other score,
# for N = 5, 10, ..., 205.
jaccard_scoresb = nb.frame(
    {
        score: {top_n: df_jaccard(collb, 'nb', score, top_n) for top_n in range(5, 210, 5)}
        for score in ('ratio', 'srel', 'lrel')
    }
).transpose()

In [None]:
# Plot the Jaccard scores for the collocates before the target word (collb);
# the trailing semicolon suppresses the text repr of the returned object.
jaccard_scoresb.plot(title='Jaccard similarity of reference corpus');

# for coll

In [None]:
# For the collocates after the target word: Jaccard similarity between the
# top-N words ranked by 'nb' and the top-N ranked by each other score,
# for N = 5, 10, ..., 205.
jaccard_scores = nb.frame(
    {
        score: {top_n: df_jaccard(coll, 'nb', score, top_n) for top_n in range(5, 210, 5)}
        for score in ('ratio', 'srel', 'lrel')
    }
).transpose()

In [None]:
# Plot the Jaccard scores for the collocates after the target word (coll).
# A title is given so the figure can be told apart from the collb plot when skimming;
# the trailing semicolon suppresses the text repr of the returned object.
jaccard_scores.plot(title='Jaccard similarity of reference corpus (coll)');