{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "\n", "# Data and computations for paper \n", "\n", "Lars G. Johnsen\n", "National Library of Norway" ] },
{ "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/markdown": [ "Updated file `D:\\documents\\GitHub\\newspapers_coll_conc\\collocations.py`" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import dhlab.nbtext as nb\n", "import dhlab.module_update as mu\n", "mu.update('collocations')\n", "from collocations import dist_coll_urn, urn_coll, calculate_midpoint, dist\n", "import pandas as pd" ] },
{ "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [
 "def df_jaccard(df, col1, col2, number=100, asc=False):\n",
 "    \"\"\"Jaccard similarity between the top rows of `df` under two rankings.\n",
 "\n",
 "    `df` is sorted by `col1` and by `col2` (descending by default, ascending\n",
 "    if `asc` is true). The index labels of the first `number` rows of each\n",
 "    ordering are then compared with `jaccard`.\n",
 "    \"\"\"\n",
 "    top_by_col1 = df.sort_values(by=col1, ascending=asc)[:number].index\n",
 "    top_by_col2 = df.sort_values(by=col2, ascending=asc)[:number].index\n",
 "    return jaccard(top_by_col1, top_by_col2)" ] },
{ "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [
 "def movealpha(x):\n",
 "    \"\"\"Return the items of `x` as a list with the last item moved to the front.\"\"\"\n",
 "    return [x[-1]] + list(x[:-1])" ] },
{ "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [
 "def jaccards(s1, s2):\n",
 "    \"\"\"Return (intersection size, union size) of `s1` and `s2` taken as sets.\"\"\"\n",
 "    first, second = set(s1), set(s2)\n",
 "    return len(first & second), len(first | second)" ] },
{ "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [
 "def jaccard(s1, s2):\n",
 "    \"\"\"Jaccard similarity of `s1` and `s2` taken as sets.\n",
 "\n",
 "    The value is the size of the intersection divided by the size of the\n",
 "    union. When both inputs are empty the union is empty as well; 0.0 is\n",
 "    returned in that case instead of raising ZeroDivisionError.\n",
 "    \"\"\"\n",
 "    first, second = set(s1), set(s2)\n",
 "    union_size = len(first | second)\n",
 "    if union_size == 0:\n",
 "        return 0.0\n",
 "    return len(first & second) / union_size" ] },
{ "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] },
{ "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [
 "def make_dcoll_df(dists, norm, tot, exp=0):\n",
 "    \"\"\"Assemble the collocation table for one distance dataframe.\n",
 "\n",
 "    Reads the columns 'freq', '1' and '2' of `dists` and the column `tot` of\n",
 "    `tot`. The result has these columns:\n",
 "\n",
 "    - freq: `dists['freq']` as integers\n",
 "    - score: (freq / freq.sum()) ** exp * abs(norm / dists['2'])\n",
 "    - dist, dist_: `dists['1']` and `dists['2']` unchanged\n",
 "    - reference: (freq / freq.sum()) / tot.tot\n",
 "\n",
 "    Missing values are replaced by 0 before `freq` is cast to int32.\n",
 "    NOTE(review): relies on `nb.frame(...).transpose()` producing one row per\n",
 "    word; confirm against dhlab.nbtext.\n",
 "    \"\"\"\n",
 "    relative_freq = dists.freq / dists.freq.sum()\n",
 "    coll = nb.frame(\n",
 "        {\n",
 "            'freq': dists['freq'].astype(int),\n",
 "            'score': relative_freq**exp * abs(norm / dists['2']),\n",
 "            'dist': dists['1'],\n",
 "            'dist_': dists['2'],\n",
 "            'reference': relative_freq / tot.tot\n",
 "        }\n",
 "    ).transpose()\n",
 "    coll = coll.fillna(0)\n",
 "    return coll.astype({'freq': 'int32'})" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "### Reference corpus\n", "\n", "The reference is the total counts from approximately 450 000 books from nb.no" ] },
{ "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "tot = nb.frame(nb.totals(50000), 'tot')" ] },
{ "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
tot
.0.058921
,0.051453
og0.025297
i0.021425
det0.012728
\n", "
" ], "text/plain": [ " tot\n", ". 0.058921\n", ", 0.051453\n", "og 0.025297\n", "i 0.021425\n", "det 0.012728" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "nb.normalize_corpus_dataframe(tot)\n", "tot.head()" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Corpus\n", "\n", "The corpus for doing collocations is a sample of 800 books from fictional literature, dewey decimal code 813." ] },
{ "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "smd = 5\n", "lmd = 10\n", "collword = 'kaffe'" ] },
{ "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [
 "def filenames(smd = smd, lmd = lmd, collword = collword, corpus_size= 1000, period = (1980, 2000)):\n",
 "    \"\"\"Return the CSV file names of the four collocation tables.\n",
 "\n",
 "    The result is a dict with the keys 'small_right', 'large_right',\n",
 "    'small_left' and 'large_left'. Every name is the '_'-joined sequence\n",
 "    collword, a distance, corpus_size, the period joined with '-', two\n",
 "    window numbers and '.csv'.\n",
 "\n",
 "    The right-hand names carry `smd` as the distance and the windows\n",
 "    (0, 2*smd) or (0, 2*lmd). The left-hand names carry `lmd` and the\n",
 "    windows (2*smd, 0) or (2*lmd, 0).\n",
 "    NOTE(review): 'small_left' uses `lmd`, not `smd`, as its distance field.\n",
 "    Kept as is because the names are used to read existing files; confirm.\n",
 "    \"\"\"\n",
 "    span = '-'.join(str(year) for year in period)\n",
 "\n",
 "    def name(distance, first_window, second_window):\n",
 "        # '.csv' is joined like every other part, so each name ends in '_.csv'.\n",
 "        parts = [collword, distance, corpus_size, span, int(first_window), int(second_window), '.csv']\n",
 "        return '_'.join(str(part) for part in parts)\n",
 "\n",
 "    return {\n",
 "        'small_right': name(smd, 0, 2*smd),\n",
 "        'large_right': name(smd, 0, 2*lmd),\n",
 "        'small_left': name(lmd, 2*smd, 0),\n",
 "        'large_left': name(lmd, 2*lmd, 0)\n",
 "    }" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "# Store the data for subsequent use" ] },
{ "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'small_right': 'kaffe_5_1000_1980-2000_0_10_.csv',\n", " 'large_right': 'kaffe_5_1000_1980-2000_0_20_.csv',\n", " 'small_left': 'kaffe_10_1000_1980-2000_10_0_.csv',\n", " 'large_left': 'kaffe_10_1000_1980-2000_20_0_.csv'}" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "names = filenames()\n", "names" ] },
{ "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": 
[], "source": [ "a1 = pd.read_csv(names['small_right'], index_col=0)\n", "a2 = pd.read_csv(names['large_right'], index_col=0)\n", "b1 = pd.read_csv(names['small_left'], index_col=0)\n", "b2 = pd.read_csv(names['large_left'], index_col=0)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Create collocation dataframe \n", "\n", "Based on data from a1 and a2, and b1 and b2. The name for the collocations after is `coll` while `collb` is for the collocates coming before." ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "coll = make_dcoll_df(a1, lmd, tot)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "collb = make_dcoll_df(b1, lmd, tot)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Sorting \n", "\n", "Sorting on the reference is by column 'nb'" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
freq score dist dist_ reference
te2373.521132.830732.8431.707
vafler133.424662.708332.92216.32
kaker923.3672.940982.97255.776
termos113.278692.805563.050
kakao633.205133.086623.12441.183
senga133.205132.916673.1215.2036
røkte123.1152633.2158.61
olje173.105593.077273.229.69598
smørbrød263.076923.157413.25147.56
eller2153.012053.313263.321.44512
rundstykker143.0033.166673.33248.224
først182.967363.246033.371.05339
kjelen222.941183.301593.476.456
pr.192.801123.466673.572.3908
mineralvann132.762433.458333.62170.416
kjeks162.754823.504763.6385.0343
bananer332.739733.59633.65299.242
wienerbrød182.680973.628573.730
kake192.64553.688893.7839.1793
prate112.638523.616673.7919.4529
" ], "text/plain": [ "" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "coll[coll.freq > 10].sort_values(by='score', ascending=False)[:20][:20].fillna(0).style.background_gradient()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
freq score dist dist_ reference
kakao633.205133.086623.12441.183
drikkes221.602566.277786.24301.237
bananer332.739733.59633.65299.242
koppene192.02844.898814.93262.334
kaker923.3672.940982.97255.776
rundstykker143.0033.166673.33248.224
vafler133.424662.708332.92216.32
konjakk372.427184.085194.12204.488
kannen112.469143.94.05198.144
Kaffen102.227174.3754.49180.986
bomull502.267574.383384.41173.405
mineralvann132.762433.458333.62170.416
koppen411.80185.546395.55150.341
smørbrød263.076923.157413.25147.56
tobakk532.304154.318784.34132.894
sirup161.919395.188895.21128.614
kanna102.380954.055564.2113.938
fløte372.604173.79633.84109.894
sukker1692.202644.533854.54108.422
Viktigste101.980255.05102.048
" ], "text/plain": [ "" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "coll.sort_values(by='reference', ascending=False)[:20].fillna(0).style.background_gradient()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
freqscoredistdist_reference
,32152.2271714.4936064.491.153050
.39642.0533884.8659044.871.241492
og27532.5510203.9196263.922.008213
eller2153.0120483.3132583.321.445120
i9801.8867925.3044785.300.844082
8471.7889095.5851985.591.629991
ned581.6313216.1439956.131.674834
\n", "
" ], "text/plain": [ " freq score dist dist_ reference\n", ", 3215 2.227171 4.493606 4.49 1.153050\n", ". 3964 2.053388 4.865904 4.87 1.241492\n", "og 2753 2.551020 3.919626 3.92 2.008213\n", "eller 215 3.012048 3.313258 3.32 1.445120\n", "i 980 1.886792 5.304478 5.30 0.844082\n", "på 847 1.788909 5.585198 5.59 1.629991\n", "ned 58 1.631321 6.143995 6.13 1.674834" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "coll.loc[[',','.', 'og','eller', 'i', 'på','ned']]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# collb" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
freq score dist dist_ reference
kopp8507.5188-1.32574-1.331119.41
kanne585.61798-1.70979-1.78581.517
skjenket1633.64964-2.71856-2.74351.469
mineralvann253.40136-2.83333-2.94327.723
rykende374.46429-2.14993-2.24257.746
Viktigste251.66667-6.01587-6255.119
kakao352.6738-3.68642-3.74245.102
serverte434.42478-2.18148-2.26240.712
kopper874.4843-2.18757-2.23220.739
vafler133.20513-2.91667-3.12216.32
drakk3265.68182-1.74805-1.76216.31
kop147.57576-1-1.32212.987
rundstykker122.20264-4.45556-4.54212.763
bananer221.96078-5.07778-5.1199.495
skjenker213.663-2.58889-2.73186.168
krus303.84615-2.4963-2.6185.665
drikker1504.80769-2.05629-2.08168.802
bestilte562.7933-3.5459-3.58164.847
spanderte94.2735-1.94444-2.34160.89
Eksport131.98413-5-5.04152.476
" ], "text/plain": [ "" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "collb.sort_values(by='reference', ascending=False)[:20].fillna(0).style.background_gradient()" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "collb = collb.drop('kop')" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
freq score dist dist_ reference
kopp8507.5188-1.32574-1.331119.41
slurk496.99301-1.34286-1.43118.642
svart1076.80272-1.42884-1.4732.0186
drakk3265.68182-1.74805-1.76216.31
koke675.64972-1.71567-1.77120.784
kanne585.61798-1.70979-1.78581.517
kokt215.40541-1.66667-1.8536.7442
kokte645.37634-1.80331-1.86123.787
sterk855.37634-1.81906-1.8615.8174
drikke2154.90196-2.02677-2.04102.011
drikker1504.80769-2.05629-2.08168.802
drukket714.71698-2.0707-2.12121.499
kopper874.4843-2.18757-2.23220.739
rykende374.46429-2.14993-2.24257.746
serverte434.42478-2.18148-2.26240.712
lage724.2735-2.29248-2.3416.3779
ny423.92157-2.4803-2.552.8695
krus303.84615-2.4963-2.6185.665
varm913.83142-2.58141-2.6139.3232
skjenker213.663-2.58889-2.73186.168
skjenket1633.64964-2.71856-2.74351.469
kilo243.59712-2.66667-2.7831.99
koker253.58423-2.67857-2.7993.5027
mer1683.50877-2.8337-2.853.67989
mineralvann253.40136-2.83333-2.94327.723
lager343.38983-2.86941-2.9515.7347
usøtet213.20513-3-3.120
hente433.10559-3.16389-3.2217.8665
helte353.10559-3.15224-3.22118.335
laget763.09598-3.19971-3.2313.7915
kjøpte233.08642-3.13333-3.248.9624
te1203.08642-3.22429-3.2416.0542
ha2523.003-3.31855-3.334.21966
mye542.98507-3.30804-3.352.57749
servert382.89855-3.39676-3.4597.8189
litt1802.86533-3.47969-3.496.84285
bomull222.85714-3.4-3.576.2981
eksportvarer252.84091-3.43452-3.520
bestilte562.7933-3.5459-3.58164.847
god532.78552-3.55011-3.593.14228
" ], "text/plain": [ "" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "collb[collb.freq > 20].sort_values(by='score', ascending=False)[:40].fillna(0).style.background_gradient()" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
freqscoredistdist_reference
kakao352.673797-3.686420-3.74245.101930
kaffe1571.497006-6.683458-6.6887.791997
te1203.086420-3.224290-3.2416.054192
sjokolade192.267574-4.351852-4.4154.396609
buljong11.818182-6.000000-5.509.346958
\n", "
" ], "text/plain": [ " freq score dist dist_ reference\n", "kakao 35 2.673797 -3.686420 -3.74 245.101930\n", "kaffe 157 1.497006 -6.683458 -6.68 87.791997\n", "te 120 3.086420 -3.224290 -3.24 16.054192\n", "sjokolade 19 2.267574 -4.351852 -4.41 54.396609\n", "buljong 1 1.818182 -6.000000 -5.50 9.346958" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "collb.loc[[x for x in ['kakao', 'kaffe', 'te', 'sjokolade', 'latte', 'buljong'] if x in collb.index]]" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "common = coll.loc[[',','.', 'og','eller', 'i', 'på']]\n" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "common['translate'] = [',', '.', 'and', 'or', 'in', 'on']\n", "\n", "common = common[movealpha(common.columns)]\n", "\n" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
translatefreqscoredistdist_reference
,,32152.2271714.4936064.491.153050
..39642.0533884.8659044.871.241492
ogand27532.5510203.9196263.922.008213
elleror2153.0120483.3132583.321.445120
iin9801.8867925.3044785.300.844082
on8471.7889095.5851985.591.629991
\n", "
" ], "text/plain": [ " translate freq score dist dist_ reference\n", ", , 3215 2.227171 4.493606 4.49 1.153050\n", ". . 3964 2.053388 4.865904 4.87 1.241492\n", "og and 2753 2.551020 3.919626 3.92 2.008213\n", "eller or 215 3.012048 3.313258 3.32 1.445120\n", "i in 980 1.886792 5.304478 5.30 0.844082\n", "på on 847 1.788909 5.585198 5.59 1.629991" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\n", "common" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "top10ratio = collb.sort_values(by='score', ascending=False)[:10]" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['nytraktet', 'nytrukket', 'kopp', 'Svart', 'nykokt', 'slurk', 'pund',\n", " 'svart', 'Mer', 'nylaget'],\n", " dtype='object')" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "top10ratio.index" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "top10ratio['translate'] = ['freshly drawn', 'freshly drawn', 'cup', 'black', 'freshly boiled', 'sip', 'pound',\n", " 'black', 'more', 'freshly made']" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "top10ratio = top10ratio[movealpha(top10ratio.columns)]" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
translatefreqscoredistdist_reference
nytraktetfreshly drawn157.692308-1.000000-1.300.000000
nytrukketfreshly drawn147.575758-1.000000-1.320.000000
koppcup8507.518797-1.325735-1.331119.407652
Svartblack127.246377-1.000000-1.3859.552151
nykoktfreshly boiled127.246377-1.000000-1.380.000000
slurksip496.993007-1.342857-1.43118.641838
pundpound136.944444-1.100000-1.4417.839419
svartblack1076.802721-1.428837-1.4732.018644
Mermore96.666667-1.000000-1.506.771161
nylagetfreshly made96.666667-1.000000-1.500.000000
\n", "
" ], "text/plain": [ " translate freq score dist dist_ reference\n", "nytraktet freshly drawn 15 7.692308 -1.000000 -1.30 0.000000\n", "nytrukket freshly drawn 14 7.575758 -1.000000 -1.32 0.000000\n", "kopp cup 850 7.518797 -1.325735 -1.33 1119.407652\n", "Svart black 12 7.246377 -1.000000 -1.38 59.552151\n", "nykokt freshly boiled 12 7.246377 -1.000000 -1.38 0.000000\n", "slurk sip 49 6.993007 -1.342857 -1.43 118.641838\n", "pund pound 13 6.944444 -1.100000 -1.44 17.839419\n", "svart black 107 6.802721 -1.428837 -1.47 32.018644\n", "Mer more 9 6.666667 -1.000000 -1.50 6.771161\n", "nylaget freshly made 9 6.666667 -1.000000 -1.50 0.000000" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "top10ratio" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [ "top10ref = collb.sort_values(by='reference', ascending=False)[:10]" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['kopp', 'kanne', 'skjenket', 'mineralvann', 'rykende', 'Viktigste',\n", " 'kakao', 'serverte', 'kopper', 'vafler'],\n", " dtype='object')" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "top10ref.index" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "top10ref['translate'] = ['cup', 'jug', 'poured', 'mineral water', 'smoking', 'main',\n", " 'cocoa', 'served', 'cups', 'waffles']" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "top10ref = top10ref[movealpha(top10ref.columns)]" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
translatefreqscoredistdist_reference
koppcup8507.518797-1.325735-1.331119.407652
kannejug585.617978-1.709788-1.78581.516609
skjenketpoured1633.649635-2.718562-2.74351.468955
mineralvannmineral water253.401361-2.833333-2.94327.723096
rykendesmoking374.464286-2.149929-2.24257.746331
Viktigstemain251.666667-6.015873-6.00255.118839
kakaococoa352.673797-3.686420-3.74245.101930
serverteserved434.424779-2.181481-2.26240.712230
koppercups874.484305-2.187566-2.23220.738775
vaflerwaffles133.205128-2.916667-3.12216.319529
\n", "
" ], "text/plain": [ " translate freq score dist dist_ reference\n", "kopp cup 850 7.518797 -1.325735 -1.33 1119.407652\n", "kanne jug 58 5.617978 -1.709788 -1.78 581.516609\n", "skjenket poured 163 3.649635 -2.718562 -2.74 351.468955\n", "mineralvann mineral water 25 3.401361 -2.833333 -2.94 327.723096\n", "rykende smoking 37 4.464286 -2.149929 -2.24 257.746331\n", "Viktigste main 25 1.666667 -6.015873 -6.00 255.118839\n", "kakao cocoa 35 2.673797 -3.686420 -3.74 245.101930\n", "serverte served 43 4.424779 -2.181481 -2.26 240.712230\n", "kopper cups 87 4.484305 -2.187566 -2.23 220.738775\n", "vafler waffles 13 3.205128 -2.916667 -3.12 216.319529" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "top10ref" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# coll" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# jaccard similarity\n", "\n", "compare over a range" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "rng = range(2,40,2)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# for collb" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "jaccard_scoresb = nb.frame({'ratio': {x:df_jaccard(collb[collb.freq > 20], 'reference', 'score', x) for x in rng}\n", " }).transpose()" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Text(0, 0.5, 'Jaccard score')" ] }, "execution_count": 38, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "ax = jaccard_scoresb['ratio'].plot(title='compared to reference corpus');\n", "ax.set_xlabel(\"Number of words\")\n", "ax.set_ylabel(\"Jaccard score\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# for coll" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "jaccard_scores = nb.frame({'ratio': {x:df_jaccard(coll, 'score', 'reference', x) for x in rng} }).transpose()" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Text(0, 0.5, 'Jaccard score')" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "ax = jaccard_scores['ratio'].plot(title='compared to reference corpus');\n", "ax.set_xlabel(\"Number of words\")\n", "ax.set_ylabel(\"Jaccard score\")" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
freqscoredistdist_reference
avec96.251.1111111.613.973378
\n", "
" ], "text/plain": [ " freq score dist dist_ reference\n", "avec 9 6.25 1.111111 1.6 13.973378" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "coll.loc[coll[coll.score > 4].index].sort_values(by='score', ascending=False)" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
freqscoredistdist_reference
kakao633.2051283.0866163.12441.183475
drikkes221.6025646.2777786.24301.236739
bananer332.7397263.5962963.65299.241863
koppene192.0283984.8988104.93262.333714
kaker923.3670032.9409842.97255.775986
..................
sigarer22.2222223.5000004.5030.292974
bryderi21.3333339.5000007.5030.238671
mmol41.2903238.5000007.7530.183413
Middag52.0703934.6666674.8330.127670
servering72.2779044.2000004.3930.082467
\n", "

137 rows × 5 columns

\n", "
" ], "text/plain": [ " freq score dist dist_ reference\n", "kakao 63 3.205128 3.086616 3.12 441.183475\n", "drikkes 22 1.602564 6.277778 6.24 301.236739\n", "bananer 33 2.739726 3.596296 3.65 299.241863\n", "koppene 19 2.028398 4.898810 4.93 262.333714\n", "kaker 92 3.367003 2.940984 2.97 255.775986\n", "... ... ... ... ... ...\n", "sigarer 2 2.222222 3.500000 4.50 30.292974\n", "bryderi 2 1.333333 9.500000 7.50 30.238671\n", "mmol 4 1.290323 8.500000 7.75 30.183413\n", "Middag 5 2.070393 4.666667 4.83 30.127670\n", "servering 7 2.277904 4.200000 4.39 30.082467\n", "\n", "[137 rows x 5 columns]" ] }, "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ "coll.loc[coll[coll.reference> 30].index].sort_values(by='reference', ascending=False)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
 "# Words that pass both thresholds: 'reference' above 30 and 'score' above 3.5.\n",
 "# The frame has no 'nb' column; the reference ratio is stored in 'reference'.\n",
 "set(coll[coll.reference > 30].index) & set(coll[coll.score > 3.5].index)" ] } ],
"metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }