exp.freq.b<-construction1.freq-exp.freq.a
exp.freq.c<-(data[i,2]+data[i,3])*construction2.freq/(construction1.freq+construction2.freq); exp.freq.2[i]<-round(exp.freq.c, which.accuracy)
exp.freq.d<-construction2.freq-exp.freq.c
coll.strength[i]<-round(switch(which.index,
fye(obs.freq.a, exp.freq.a, construction1.freq, sum(construction1.freq, construction2.freq), sum(obs.freq.a, obs.freq.c)),
llr(obs.freq.a, obs.freq.b, obs.freq.c, obs.freq.d, exp.freq.a, exp.freq.b, exp.freq.c, exp.freq.d),
log(((obs.freq.a+0.5)/(obs.freq.b+0.5))/((obs.freq.c+0.5)/(obs.freq.d+0.5)), 10)), which.accuracy)
if (obs.freq.a>exp.freq.a) {
pref.occur[i]<-as.character(construction1.name)
} else if (obs.freq.a<exp.freq.a) {
pref.occur[i]<-as.character(construction2.name)
} else {
pref.occur[i]<-"no_preference"
}
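# delta p (directional association): how well does the construction predict the word, and the word the construction?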
delta.p.constr.to.word[i]<-round((obs.freq.a/(obs.freq.a+obs.freq.b))-(obs.freq.c/(obs.freq.c+obs.freq.d)), which.accuracy)
delta.p.word.to.constr[i]<-round((obs.freq.a/(obs.freq.a+obs.freq.c))-(obs.freq.b/(obs.freq.b+obs.freq.d)), which.accuracy)
overlap<-ifelse(all(obs.freq.a>0, obs.freq.c>0), overlap+1, overlap)
}
output.table<-data.frame(words, obs.freq.1, obs.freq.2, exp.freq.1, exp.freq.2, pref.occur, delta.p.constr.to.word, delta.p.word.to.constr, coll.strength)
sort.index<-switch(which.sort, order(words), order(-obs.freq.1, words),order(-obs.freq.2, words), order(pref.occur, -coll.strength))
output.table<-as.data.frame(output.table[sort.index,])
cat("\a") # progress beep
# output
which.index<-switch(which.index, "-log10(Fisher-Yates exact, one-tailed)", "log-likelihood", "log10 of odds ratio (adds 0.5 to each cell)")
cat("\n")
if (which.output==1) {
cat("\nWhich text file do you want to store the result in?\n(Note: if you choose a file that already exists, the current output will be appended to this file.)\t"); pause()
output.file<-file.choose(); output<-file(output.file, open="at")
cat("|---------------------------------------------------------------------|\n| This output is provided without any warranty on an as-is basis by   |\n| Stefan Th. Gries <http://www.linguistics.ucsb.edu/faculty/stgries/> |\n| Please cite the program as mentioned in <readme.txt>. Thanks a lot! |\n|---------------------------------------------------------------------|\n\n", date(), file=output)
cat("\n\nDistinctive collocate/collexeme analysis for: ", as.character(construction1.name), " vs. ", as.character(construction2.name), "\n\nobs.freq.1: observed frequency of the word A-? in/with ", as.character(construction1.name), "\nobs.freq.2: observed frequency of the word A-? in/with ", as.character(construction2.name), "\nexp.freq.1: expected frequency of the word A-? in/with ", sep="", file=output)
cat(as.character(construction1.name), "\nexp.freq.2: expected frequency of the word A-? in/with ", as.character(construction2.name), "\npref.occur: the word/construction to which the word A-? is attracted\ndelta.p.constr.to.word: delta p: how much does the word/construction help guess the word?\ndelta.p.word.to.constr: delta p: how much does the construction help guess the word/construction?\ncoll.strength: index of distinctive collostructional strength:", which.index, ", the higher, the more distinctive\n\n", sep="", file=output)
write.table(output.table, file=output, quote=F, row.names=F, sep="\t", eol="\n")
cat("\nIf your collostruction strength is based on p-values, it can be interpreted as follows:\nColl.strength>3 => p<0.001; coll.strength>2 => p<0.01; coll.strength>1.30103 => p<0.05.\nOut of the ", cases, " investigated, ", overlap," collocates/collexemes are shared by both words/constructions; i.e. ", (overlap/cases*100), "%\n\n\nI'd be happy if you provided me with feedback and acknowledged the use of Coll.analysis 3.2a.\n", sep="", file=output)
close(output)
} else {
cat("|---------------------------------------------------------------------|\n| This output is provided without any warranty on an as-is basis by   |\n| Stefan Th. Gries <http://www.linguistics.ucsb.edu/faculty/stgries/> |\n| Please cite the program as mentioned in <readme.txt>. Thanks a lot! |\n|---------------------------------------------------------------------|\n\n", date(), "\n\nDistinctive collocate/collexeme analysis for: ")
cat(as.character(construction1.name), " vs. ", as.character(construction2.name), "\n\nobs.freq.1: observed frequency of the word A-? in/with ", as.character(construction1.name), "\nobs.freq.2: observed frequency of the word A-? in/with ", as.character(construction2.name), "\nexp.freq.1: expected frequency of the word A-? in/with ", as.character(construction1.name), "\nexp.freq.2: expected frequency of the word A-? in/with ")
cat(as.character(construction2.name), "\npref.occur: the word/construction to which the word A-? is attracted\ncoll.strength: index of distinctive collostructional strength: ", which.index, ", the higher, the more distinctive\n\n", sep="")
options(width=7500); print(output.table)
cat("\nIf your collostruction strength is based on p-values, it can be interpreted as follows:\nColl.strength>3 => p<0.001; coll.strength>2 => p<0.01; coll.strength>1.30103 => p<0.05.\nOut of the ", cases, " investigated, ", overlap," collocates/collexemes are shared by both words/constructions; i.e. ", (overlap/cases*100), "%\n\n\nI'd be happy if you provided me with feedback and acknowledged the use of Coll.analysis 3.2a.\n", sep="")
}
} else {
# introduction
cat("\nIn this case of multiple distinctive collexeme analysis, a more detailed introduction is necessary.\nIn regular collexeme analysis as well as distinctive collexeme analysis, we have always used the\none-tailed Fisher Yates exact test to compute the association strength between elements. As the name indicates,\nthis is an exact tests which is applied to 2-by-2 table and based on the hypergeometric distribution,")
cat("\ni.e., on sampling without replacement. If you want to perform a distinctive collexeme analysis with more\nthan two alternatives, e.g. English active vs. be-passive vs. get-passive, however,\n\nVOICE\t\tVERB\nactive\t\tthink\nbe-passive\ttell\nget-passive\tkill\n...\t\t...\n\nthen the Fisher-Yates exact test cannot be used anymore. ")
cat("The equivalent test for more than two alternatives\nis the so-called multinomial test, an exact test with sampling without replacement for 2+ alternatives.\nHowever, given the present purposes this test has two weaknesses:\n(i) it is computationally so expensive that sample sizes of several thousand items already exceed the capabilities of\nstate-of-the-art desktop computers in fall 2004, and ")
cat("(ii) the multinomial test only gives you a single\np-value and, thus, doesn't tell you where some deviation actually comes from: is an\noverall large deviation due to the low frequency for, say, _think_ in actives, or, say, the high frequency of,\nsay, _kill_ in get-passives? That is, even if the test was possible computationally,\nit would not yet answer the interesting questions.\n   ")
cat("Thus, this script uses an approximation to the multinomial test, namely the one-tailed exact binomial test.\nThis test is still an exact test, i.e., it is not sensitive to low frequencies. To use the above example,\nthe present implementation of the exact binomial test computes one p-value for each word in with\neach other word / in each construction (as in configural frequency analysis) and ")
cat("log-transforms it such that\nhighly positive and highly negative values indicate a large degree of attraction and repulsion respectively\nwhile 0 indicates random co-occurrence.\n   Then, to make the results more accessible, the script also outputs columns called SumAbsDev and LargestDev.\nAgain, using the above example, the former tells you for each verb the sum of all ")
cat("absolute log-transformed p-values, i.e.,\nhow strongly each verb's observed frequencies across all voices differ from the expected ones.\nThe latter tell you for each verb the single voice with the largest deviations from the expected frequencies.\n")
# input of parameters
cat("\nFor such a multiple distinctive collexeme analysis, Coll.analysis 3.2a expects as input\na file with a table of all tokens. That is, the first column contains for\neach co-occurrence item the code for one of the X words/constructions W/C\nyou want to investigate; the second column contains the word co-occurring with W/C")
cat("\nas listed in the first column.\n\nW/C\tColl_Word\nA\tX\nB\tY\nC\tZ\n...\t...\n\nYour file ideally has no spaces (use '_' instead) and don't forget that R's treatment of alphanumeric characters\nis case-sensitive! The computation of this analysis can require several minutes or even more time ...\n\nChoose the text file with the input data!\t"); pause()
mdca.data<-read.table(file.choose(), header=T, sep="\t", quote="", comment.char="")
names(mdca.data)<-c("W_C", "Coll_Word")
which.sort<-menu(choice=c("alphabetically (W_C)", "sum of absolute deviations per W_C", "W_Cs' largest deviation"), title="\nHow do you want to sort the output?")
# determine column frequencies
tab.mca.data<-table(mdca.data$Coll_Word, mdca.data$W_C) # generate table for multiple dca
colfreq<-table(mdca.data$W_C)
verb<-rownames(tab.mca.data); constr<-colnames(tab.mca.data)
n.verb<-length(verb); n.constr<-length(constr)
result.table<-data.frame(matrix(nrow=n.verb, ncol=(n.constr*3)+3))
colnames(result.table)<-c("Coll_Word", as.character(constr), paste("exp", as.character(constr), sep="_"), paste("pbin", as.character(constr), sep="_"), "SumAbsDev", "LargestDev")
result.table[,1]<-rownames(tab.mca.data)
result.table[,2:(n.constr+1)]<-tab.mca.data[,1:n.constr]
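# result.table layout: Coll_Word | observed freqs per W/C | expected freqs (exp_*) per W/C | binomial log10 p-values (pbin_*) per W/C | SumAbsDev | LargestDev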
for (f in 1:n.verb) {
cur.obs<-tab.mca.data[f,]
cur.exp<-sum(cur.obs)*(colfreq/sum(colfreq))
result.table[f,(n.constr+2):(n.constr+n.constr+1)]<-round(cur.exp, which.accuracy)
counter<-0
for (g in (n.constr*2+2):(length(result.table)-2)) {
counter<-counter+1
if (cur.obs[counter]>=cur.exp[counter]) {
result.table[f,g]<-round(-log(sum(dbinom(cur.obs[counter]:sum(cur.obs), sum(cur.obs), (cur.exp[counter]/sum(cur.obs)))), 10), which.accuracy)
} else {
result.table[f,g]<-round(log(sum(dbinom(0:cur.obs[counter], sum(cur.obs), (cur.exp[counter]/sum(cur.obs)))), 10), which.accuracy)
}
}
result.table[f,length(result.table)-1]<-round(sum(abs(result.table[f,(length(names(result.table))-n.constr-1):(length(names(result.table))-2)])), which.accuracy)
largest.value<-round(max(abs(result.table[f,(length(result.table)-n.constr-1):(length(result.table)-2)])), which.accuracy)
largest.word<-as.character(constr[which(abs(result.table[f,(length(result.table)-n.constr-1):(length(result.table)-2)])==largest.value)])
if (length(largest.word)>1) { largest.word<-paste(largest.word, collapse="_&_") }
result.table[f,length(result.table)]<-largest.word
}
attach(result.table)
cat("\a") # progress beep
# output
which.output<-menu(choice=c("text file", "terminal"), title="Where do you want the output ('text file' will append to already existing file with the same name)?")
sort.index<-switch(which.sort, order(Coll_Word), order(-SumAbsDev, Coll_Word), order(LargestDev, -SumAbsDev))
result.table<-as.data.frame(result.table[sort.index,])
if (which.output==1) {
cat("\nWhich text file do you want to store the result in?\n(Note: if you choose a file that already exists, the current output will be appended to this file.)\t"); pause()
output.file<-file.choose(); output<-file(output.file, open="at")
cat("|---------------------------------------------------------------------|\n| This output is provided without any warranty on an as-is basis by   |\n| Stefan Th. Gries <http://www.linguistics.ucsb.edu/faculty/stgries/> |\n| Please cite the program as mentioned in <readme.txt>. Thanks a lot! |\n|---------------------------------------------------------------------|\n\n", date(), file=output)
cat("\n\nMultiple distinctive collocate/collexeme analysis for:", paste(as.character(constr), collapse=" "), "\n\nColl_Word: collocate of the words/constructions to be contrasted\nThe next ", paste(as.character(constr), collapse=" "), " columns are the words/constructions to be contrasted and their observed co-occurrence frequencies\nThe next ", paste(as.character(constr), collapse=" "), file=output)
cat(" columns are the words/constructions to be contrasted and their expected co-occurrence frequencies\nThe next ", paste(as.character(constr), collapse=" "), " columns are the log-transformed p-values of the words/constructions to be contrasted (+ = attraction, - = repulsion)\nSumAbsDev: the sum of the absolute values of the preceding ", n.constr, " columns: the larger, the stronger the deviation", file=output)
cat("\nLargestDev: the word/construction where the strongest deviation from observed to expected is found\n\n", sep="", file=output)
write.table(result.table, file=output, quote=F, row.names=F, sep="\t", eol="\n")
cat("\n\nSorting according to the 'pbin' columns will yield the most relevant outcomes for each word/construction.\npbin_*>3 => p<0.001; pbin_*>2 => p<0.01; pbin_*>1.30103 => p<0.05.\nI'd be happy if you provided me with feedback and acknowledged the use of Coll.analysis 3.2a.\n", file=output)
close(output)
} else {
cat("\n|---------------------------------------------------------------------|\n| This output is provided without any warranty on an as-is basis by   |\n| Stefan Th. Gries <http://www.linguistics.ucsb.edu/faculty/stgries/> |\n| Please cite the program as mentioned in <readme.txt>. Thanks a lot! |\n|---------------------------------------------------------------------|\n\n", date())
cat("\n\nMultiple distinctive collocate/collexeme analysis for: ", paste(as.character(constr), collapse=" "), "\n\nColl_Word: collocate of the words/constructions to be contrasted\nThe next", as.character(constr), "columns are the words/constructions to be contrasted and their observed co-occurrence frequencies\nThe next ", paste(as.character(constr), collapse=" "))
cat(" columns are the words/constructions to be contrasted and their expected co-co-occurrence frequencies\nThe next ", paste(as.character(constr), collapse=" "), " columns are the log-transformed p-values of the words/constructions to be contrasted (+ = attraction, - = repulsion)\nSumAbsDev: the sum of the absolute values of the preceding ", n.constr)
cat(" columns: the larger, the stronger the deviation\nLargestDev: the word/construction where the strongest deviation from observed to expected is found\n\n")
options(width=7500); print(result.table)
cat("\npbin_*>3 => p<0.001; pbin_*>2 => p<0.01; pbin_*>1.30103 => p<0.05.\nI'd be happy if you provided me with feedback and acknowledged the use of Coll.analysis 3.2a.\n")
}
}
} # END OF FUNCTION FOR DISTINCTIVE COLLEXEME ANALYSIS
covar.collexemes<-function() { # FUNCTION FOR CO-VARYING COLLEXEME ANALYSIS
cat("\nC o v a r y i n g   c o l l e x e m e   a n a l y s i s   . . .\n")
# introduction
cat("\n\nThis kind of analysis investigated dependencies within two slots of a single construction.\nThis script so far only implements the so-called item-based analysis since comparative studies\n have shown that the system-based correction may require many days computational time with only")
cat("\nminor differences in the results (cf. Stefanowitsch and Gries 2005). However, somewhere down the road I may find \ntime to work on an implementation of this technique so that arbitrarily many additional variables\n(e.g. register, corpora etc.) can be included.\n")
# input of parameters
cat("\nColl.analysis 3.2a requires as input for the item-based co-varying collexeme analysis:\na file with a table of all token instances of the construction C with\nthe two words W1 and W2 occurring in the slots of each instance of C.\n")
cat("That is, you need the following kind of input file (with column names!)),\nwhere the number of rows corresponds to the number of construction tokens you have.\n\nWord_Slot1\tWord_Slot2\nA\t\tX\nB\t\tX\n...\t...\n\n")
cat("Your file must not have decimal points/separators and ideally has no spaces (for the latter, use '_' instead)!\nAlso, don't forget that R's treatment of alphanumeric characters is case-sensitive!\n\n")
cat("\nWhat is the name of the construction C you investigate (without spaces)?\t")
construction.name<-scan(nmax=1, what="character", quiet=T)
if (length(construction.name)==0) construction.name<-"some_C"
which.combos<-menu(choice=c("all possible combinations (can be memory-intensive)", "only attested combinations (not memory-intensive at all)"), title="\nWhich combinations do you want to include?")
which.index<-menu(choice=c("-log10 (Fisher-Yates exact, one-tailed) (= default)", "log-likelihood", "log10 of odds ratio (adds 0.5 to each cell)"), title="\nWhich index of association strength do you want to compute?")
which.sort<-menu(choice=c("alphabetically (W1)", "alphabetically (W2)", "frequency (W1)", "frequency (W2)", "collostruction strength"), title="\nHow do you want to sort the output?")
cat("\nEnter the number of decimals you'd like to see in the results (and '99', when you want the default output)!\t")
which.accuracy<-scan(nmax=1, quiet=T); cat("\n")
while (which.accuracy<=0) { cat("\nWith a value of 0 or smaller, the output might not be very meaningful - enter the correct number of decimals!\n"); which.accuracy<-scan(nmax=1, quiet=T) }
cat("\nChoose the text file with the raw data!\t"); pause()
data<-read.table(file.choose(), header=T, sep="\t", colClasses=c("character", "character"), quote="", comment.char="")
types.in.1<-sort(unique(data[,1])); ntypes.in.1<-length(types.in.1)
types.in.2<-sort(unique(data[,2])); ntypes.in.2<-length(types.in.2)
construction.freq<-length(data[,1])
x<-table(data)
W1_C<-rep(types.in.1, each=ntypes.in.2)
W2_C<-rep(types.in.2, ntypes.in.1)
Freq_W1_C<-rep(as.vector(rowSums(x)), each=ntypes.in.2)
Freq_W2_C<-rep(as.vector(colSums(x)), ntypes.in.1)
W1_W2_in_C<-as.vector(t(x))
data<-data.frame(W1_C, W2_C, Freq_W1_C, Freq_W2_C, W1_W2_in_C)
if (which.combos==2) {
data<-subset(data, data[,5]!=0)
}
# computation
cases<-length(data[,1])
words1<-data[,1]; words2<-data[,2]; freq.w1<-data[,3]; freq.w2<-data[,4]; obs.w1_2.in_c<-data[,5]
exp.w1_2.in_c<-c(rep(0, cases)); relation<-c(rep(0, cases)); delta.p.constr.to.word<-delta.p.word.to.constr<-coll.strength<-c(rep(0, cases))
for (i in 1:cases) {
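# 2x2 table for the current word pair: a = W1 & W2 together in C, b = W1 with other W2s,
# c = W2 with other W1s, d = all remaining tokens of C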
obs.freq.a<-obs.w1_2.in_c[i]
obs.freq.b<-freq.w1[i]-obs.freq.a
obs.freq.c<-freq.w2[i]-obs.freq.a
obs.freq.d<-construction.freq-(obs.freq.a+obs.freq.b+obs.freq.c)
exp.freq.a<-freq.w1[i]*freq.w2[i]/construction.freq; exp.w1_2.in_c[i]<-round(exp.freq.a, which.accuracy)
exp.freq.b<-freq.w1[i]-exp.freq.a
exp.freq.c<-freq.w2[i]-exp.freq.a
exp.freq.d<-construction.freq-(exp.freq.a+exp.freq.b+exp.freq.c)
coll.strength[i]<-round(switch(which.index,
fye(obs.freq.a, exp.freq.a, freq.w1[i], construction.freq, freq.w2[i]),
llr(obs.freq.a, obs.freq.b, obs.freq.c, obs.freq.d, exp.freq.a, exp.freq.b, exp.freq.c, exp.freq.d),
log(((obs.freq.a+0.5)/(obs.freq.b+0.5))/((obs.freq.c+0.5)/(obs.freq.d+0.5)), 10)), which.accuracy)
if (obs.freq.a>exp.freq.a) {
relation[i]<-"attraction"
} else if (obs.freq.a<exp.freq.a) {
relation[i]<-"repulsion"
} else {
relation[i]<-"chance"
}
delta.p.constr.to.word[i]<-round((obs.freq.a/(obs.freq.a+obs.freq.b))-(obs.freq.c/(obs.freq.c+obs.freq.d)), which.accuracy)
delta.p.word.to.constr[i]<-round((obs.freq.a/(obs.freq.a+obs.freq.c))-(obs.freq.b/(obs.freq.b+obs.freq.d)), which.accuracy)
}
which.index<-switch(which.index, "-log10 (Fisher-Yates exact, one-tailed)", "log-likelihood", "log10 of odds ratio (adds 0.5 to each cell)")
exp.w1_2.in_c<-round(exp.w1_2.in_c, 2); output.table<-data.frame(words1, words2, freq.w1, freq.w2, obs.w1_2.in_c, exp.w1_2.in_c, relation, delta.p.constr.to.word, delta.p.word.to.constr, coll.strength)
cat("\a") # progress beep
which.output<-menu(choice=c("text file", "terminal"), title="\nWhere do you want the output ('text file' will append to already existing file with the same name)?")
# output
sort.index<-switch(which.sort, order(words1, relation, -coll.strength), order(words2, relation, -coll.strength), order(-freq.w1, relation, -coll.strength), order(-freq.w2, relation, -coll.strength), order(relation, -coll.strength))
output.table<-output.table[sort.index,]
if (which.output==1) {
cat("\nWhich text file do you want to store the result in?\n(Note: if you choose a file that already exists, the current output will be appended to this file.)\t"); pause()
output.file<-file.choose(); output<-file(output.file, open="at")
cat("|---------------------------------------------------------------------|\n| This output is provided without any warranty on an as-is basis by   |\n| Stefan Th. Gries <http://www.linguistics.ucsb.edu/faculty/stgries/> |\n| Please cite the program as mentioned in <readme.txt>. Thanks a lot! |\n|---------------------------------------------------------------------|\n\n", date(), file=output)
cat("\n\nCo-varying collexeme analysis for: ", construction.name, "\n\nwords1: words in the 1st slot of ", construction.name, "\nwords2: words in the 2nd slot of ", construction.name, "\nfreq.w1: frequency of word1 in ", construction.name, "\nfreq.w2: frequency of word2 in ", construction.name, "\nobs.w1_2.in_c: observed frequency of both words in both slots in ", construction.name, file=output)
cat("\nexp.w1_2.in_c: expected frequency of both words in both slots in ", construction.name, "\nrelation: relation between observed and expected frequency\ncoll.strength: index of co-varying collexeme strength: ", which.index, ", the higher, the stronger\n\n", sep="", file=output)
write.table(output.table, file=output, quote=F, row.names=F, sep="\t", eol="\n")
cat("\nIf your collostruction strength is based on p-values, it can be interpreted as follows:\nColl.strength>3 => p<0.001; coll.strength>2 => p<0.01; coll.strength>1.30103 => p<0.05.\nI'd be happy if you provided me with feedback and acknowledged the use of Coll.analysis 3.2a.\n", file=output)
close(output)
} else {
cat("|---------------------------------------------------------------------|\n| This output is provided without any warranty on an as-is basis by   |\n| Stefan Th. Gries <http://www.linguistics.ucsb.edu/faculty/stgries/> |\n| Please cite the program as mentioned in <readme.txt>. Thanks a lot! |\n|---------------------------------------------------------------------|\n\n", date())
cat("\n\nCo-varying collexeme analysis for: ", construction.name, "\n\nwords1: words in the 1st slot of ", construction.name, "\nwords2: words in the 2nd slot of ", construction.name, "\nfreq.w1: frequency of word1 in ", construction.name, "\nfreq.w2: frequency of word2 in ", construction.name, "\nobs.w1_2.in_c: observed frequency of both words in both slots in ", construction.name)
cat("\nexp.w1_2.in_c: expected frequency of both words in both slots in ", construction.name, "\nrelation: relation between observed and expected frequency\ncoll.strength: index of co-varying collexeme strength: ", which.index, ", the higher, the stronger\n\n", sep="")
options(width=7500); print(output.table)
cat("\nIf your collostruction strength is based on p-values, it can be interpreted as follows:\nColl.strength>3 => p<0.001; coll.strength>2 => p<0.01; coll.strength>1.30103 => p<0.05.\nI'd be happy if you provided me with feedback and acknowledged the use of Coll.analysis 3.2a.\n")
}
} # END OF FUNCTION FOR CO-VARYING COLLEXEME ANALYSIS
pause<-function() {
cat("Press <Enter> to continue ... ")
readline()
invisible()
}
fye<-function(oa, ea, cf, cs, wf) {
if(oa>ea) {
return(-log(sum(dhyper(oa:cf, cf, (cs-cf), wf)), 10))
} else {
return(-log(sum(dhyper(0:oa, cf, (cs-cf), wf)), 10))
}
}
llr<-function(oa, ob, oc, od, ea, eb, ec, ed) {
s1<-ifelse(is.nan(log(oa/ea)*oa), 0, log(oa/ea)*oa)
s2<-ifelse(is.nan(log(ob/eb)*ob), 0, log(ob/eb)*ob)
s3<-ifelse(is.nan(log(oc/ec)*oc), 0, log(oc/ec)*oc)
s4<-ifelse(is.nan(log(od/ed)*od), 0, log(od/ed)*od)
return(2*sum(s1, s2, s3, s4))
}
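# A quick sanity check of the two association functions above (not part of the original workflow, and
# commented out so that sourcing this file is unaffected): for the give/double-object table that is
# also analysed by hand further below (46 give tokens in the double-object construction, 63 double-object
# tokens, 60 give tokens, 254 tokens in total), fye() should return -log10 of the one-tailed p-value of
# fisher.test(), and llr() takes the observed and expected cell frequencies of that same table:
#    fye(46, 63*60/254, 63, 254, 60)
#    -log10(fisher.test(matrix(c(46,14,17,177), ncol=2), alternative="greater")$p.value)
#    llr(46, 17, 14, 177, 63*60/254, 63-63*60/254, 191*60/254, 191-191*60/254)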
coll.analysis()
library(readxl)   # for read_excel()
library(effects)  # for allEffects()
data = read_excel("/Users/sergeimonakhovsergomon/datreg_train.xls")
model = lm(compositionality_score_logDice ~ probability_ratio + prefix, data=data); summary(model)
plot(allEffects(model), ask=FALSE, grid=TRUE)
data = read_excel("/Users/sergeimonakhovsergomon/datreg_train.xls")
model = lm(compositionality_score_logDice ~ frequency_ratio + prefix, data=data); summary(model)
plot(allEffects(model), ask=FALSE, grid=TRUE)
# hypergeometric distribution: words
(mdat = matrix(c(46,14,17,177), nrow=2, ncol=2, byrow=F,
dimnames = list(c("double_object", "to_dative"), c("give", "other_verbs"))))
(N = sum(mdat))
(n = sum(mdat[,1]))
(K = sum(mdat[1,]))
(k = mdat[1,1])
(prob = (choose(K, k) * choose((N - K), (n - k))) / choose(N, n)) ### probability of this particular table
mdat_upd = mdat ### adding up probabilities of all more extreme values
probs = prob
for (value in seq(from=min(mdat[1,1],mdat[1,2])-1, to=0, by=-1)) {
if (min(mdat[1,1],mdat[1,2])==mdat[1,1]) {
extrem = matrix(c(value,mdat_upd[2,1]+1,mdat_upd[1,2]+1,mdat_upd[2,2]-1),
nrow=2, ncol=2, byrow=F)
} else {extrem = matrix(c(mdat_upd[1,1]+1,mdat_upd[2,1]-1,value,mdat_upd[2,2]+1),
nrow=2, ncol=2, byrow=F)}
mdat_upd = extrem
N = sum(mdat_upd)
n = sum(mdat_upd[,1])
K = sum(mdat_upd[1,])
k = mdat_upd[1,1]
prob = (choose(K, k) * choose((N - K), (n - k))) / choose(N, n)
probs = probs + prob
print(prob)
}
sum(mdat_upd) == sum(mdat)
probs ### final probability
(result = fisher.test(mdat, alternative="greater")) ### comparing this probability with that of Fisher test
result$p.value
probs
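### the loop above enumerates the more extreme tables explicitly; for the same give/double-object table,
### the one-tailed p-value can also be obtained directly from the hypergeometric distribution
### (which is what the fye() function defined above does):
sum(dhyper(46:60, 63, 254-63, 60))              ### P(cell [1,1] >= 46) given the fixed margins
phyper(45, 63, 254-63, 60, lower.tail=FALSE)    ### same value, via the distribution function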
# verifying results by hand
data = read.table("1.csv", header=T)
# verifying results by hand
data = read_excel("/Users/sergeimonakhovsergomon/1.xls")
data
word = "give"
(cell_11 = as.numeric(data$FREQ_WORD_in_DITRANSITIVE[data$WORD==word]))
(cell_12 = as.numeric(data$FREQ_WORD_in_CORPUS[data$WORD==word]) - cell_11)
(cell_21 = sum(as.numeric(data$FREQ_WORD_in_DITRANSITIVE[!data$WORD==word])))
(cell_22 = sum(as.numeric(data$FREQ_WORD_in_CORPUS[!data$WORD==word])) - cell_21)
(mdat = matrix(c(cell_11,cell_12,cell_21,cell_22), nrow=2, ncol=2, byrow=F,
dimnames = list(c("DITR", "nonDITR"), c("give", "other_verbs"))))
(result = fisher.test(mdat, alternative="greater"))
-log10(result$p.value)
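### this -log10-transformed p-value should match the coll.strength that coll.analysis() reports for 'give'
### when the default index (-log10 of the one-tailed Fisher-Yates exact p-value) is selected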
sum(mdat)
mdat
# collostructional analysis package
coll.analysis()
# verifying results by hand
data = read_excel("/Users/sergeimonakhovsergomon/1.xls")
word = "tell"
(cell_11 = as.numeric(data$FREQ_WORD_in_DITRANSITIVE[data$WORD==word]))
(cell_12 = as.numeric(data$FREQ_WORD_in_CORPUS[data$WORD==word]) - cell_11)
(cell_21 = sum(as.numeric(data$FREQ_WORD_in_DITRANSITIVE[!data$WORD==word])))
(cell_22 = sum(as.numeric(data$FREQ_WORD_in_CORPUS[!data$WORD==word])) - cell_21)
(mdat = matrix(c(cell_11,cell_12,cell_21,cell_22), nrow=2, ncol=2, byrow=F,
dimnames = list(c("DITR", "nonDITR"), c(word, "other_verbs"))))
sum(mdat)
(result = fisher.test(mdat, alternative="greater"))
-log10(result$p.value)
word = "send"
(cell_11 = as.numeric(data$FREQ_WORD_in_DITRANSITIVE[data$WORD==word]))
(cell_12 = as.numeric(data$FREQ_WORD_in_CORPUS[data$WORD==word]) - cell_11)
(cell_21 = sum(as.numeric(data$FREQ_WORD_in_DITRANSITIVE[!data$WORD==word])))
(cell_22 = sum(as.numeric(data$FREQ_WORD_in_CORPUS[!data$WORD==word])) - cell_21)
(mdat = matrix(c(cell_11,cell_12,cell_21,cell_22), nrow=2, ncol=2, byrow=F,
dimnames = list(c("DITR", "nonDITR"), c(word, "other_verbs"))))
sum(mdat)
(result = fisher.test(mdat, alternative="greater"))
-log10(result$p.value)
word = "offer"
(cell_11 = as.numeric(data$FREQ_WORD_in_DITRANSITIVE[data$WORD==word]))
(cell_12 = as.numeric(data$FREQ_WORD_in_CORPUS[data$WORD==word]) - cell_11)
(cell_21 = sum(as.numeric(data$FREQ_WORD_in_DITRANSITIVE[!data$WORD==word])))
(cell_22 = sum(as.numeric(data$FREQ_WORD_in_CORPUS[!data$WORD==word])) - cell_21)
(mdat = matrix(c(cell_11,cell_12,cell_21,cell_22), nrow=2, ncol=2, byrow=F,
dimnames = list(c("DITR", "nonDITR"), c(word, "other_verbs"))))
sum(mdat)
(result = fisher.test(mdat, alternative="greater"))
-log10(result$p.value)
mdat_upd = mdat ### adding up probabilities of all more extreme values (re-initialized for this table)
N = sum(mdat); n = sum(mdat[,1]); K = sum(mdat[1,]); k = mdat[1,1]
prob = (choose(K, k) * choose((N - K), (n - k))) / choose(N, n)
probs = prob
for (value in seq(from=min(mdat[1,1],mdat[1,2])-1, to=0, by=-1)) {
if (min(mdat[1,1],mdat[1,2])==mdat[1,1]) {
extrem = matrix(c(value,mdat_upd[2,1]+1,mdat_upd[1,2]+1,mdat_upd[2,2]-1),
nrow=2, ncol=2, byrow=F)
} else {extrem = matrix(c(mdat_upd[1,1]+1,mdat_upd[2,1]-1,value,mdat_upd[2,2]+1),
nrow=2, ncol=2, byrow=F)}
mdat_upd = extrem
N = sum(mdat_upd)
n = sum(mdat_upd[,1])
K = sum(mdat_upd[1,])
k = mdat_upd[1,1]
prob = (choose(K, k) * choose((N - K), (n - k))) / choose(N, n)
probs = probs + prob
print(mdat_upd)
}
-log10(0.05)
-log10(0.10)
-log10(0.90)
-log10(0.20)
-log10(0.0000)
-log10(0.0002)
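### note: -log10(0.05) = 1.30103 is the p<0.05 threshold quoted in the coll.analysis() output legends;
### the other values just probe how the transform behaves for larger and smaller p-values (-log10(0) is Inf)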
data = read_excel("/Users/sergeimonakhovsergomon/datreg_train.xls")
model = lm(compositionality_score_logDice ~ probability_ratio + prefix, data=data); summary(model)
model = lm(compositionality_score ~ probability_ratio + prefix, data=data); summary(model)
plot(allEffects(model), ask=FALSE, grid=TRUE)
new_data_M = read_excel("/Users/sergeimonakhovsergomon/datreg_test.xls")
predictions = predict(model, new_data_M)
cor.test(new_data_M$compositionality_score_logDice, predictions)
cor.test(new_data_M$compositionality_score, predictions)
plot(new_data_M$compositionality_score, predictions)
data = read_excel("/Users/sergeimonakhovsergomon/datreg_train.xls")
model = lm(compositionality_score ~ frequency_ratio + prefix, data=data); summary(model)
plot(allEffects(model), ask=FALSE, grid=TRUE)
new_data_H = read_excel("/Users/sergeimonakhovsergomon/datreg_test.xls")
predictions = predict(model, new_data_H)
cor.test(new_data_H$compositionality_score, predictions)
plot(new_data_H$compositionality_score, predictions)
data = read_excel("/Users/sergeimonakhovsergomon/datreg_train.xls")
model = lm(compositionality_score ~ probability_ratio + prefix, data=data); summary(model)
model = lm(compositionality_score ~ probability_ratio, data=data); summary(model)
model = lm(compositionality_score ~ probability_ratio + prefix, data=data); summary(model)
model$contrasts
data = read_excel("/Users/sergeimonakhovsergomon/datreg_train.xls")
model = lm(compositionality_score ~ frequency_ratio + prefix, data=data); summary(model)
data = read_excel("/Users/sergeimonakhovsergomon/datreg_train.xls")
model = lm(compositionality_score ~ probability_ratio + prefix, data=data); summary(model)
new_data_M = read_excel("/Users/sergeimonakhovsergomon/datreg_test.xls")
predictions = predict(model, new_data_M)
cor.test(new_data_M$compositionality_score, predictions, method='spearman')
data = read_excel("/Users/sergeimonakhovsergomon/datreg_train.xls")
model = lm(compositionality_score ~ frequency_ratio + prefix, data=data); summary(model)
new_data_H = read_excel("/Users/sergeimonakhovsergomon/datreg_test.xls")
predictions = predict(model, new_data_H)
cor.test(new_data_H$compositionality_score, predictions, method='spearman')
data = read_excel("/Users/sergeimonakhovsergomon/datreg_train.xls")
model = lm(compositionality_score ~ probability_ratio + prefix, data=data); summary(model)
plot(allEffects(model), ask=FALSE, grid=TRUE)
new_data_M = read_excel("/Users/sergeimonakhovsergomon/datreg_test.xls")
predictions = predict(model, new_data_M)
cor.test(new_data_M$compositionality_score, predictions, method='pearson')
data = read_excel("/Users/sergeimonakhovsergomon/datreg_train.xls")
model = lm(compositionality_score ~ frequency_ratio + prefix, data=data); summary(model)
new_data_H = read_excel("/Users/sergeimonakhovsergomon/datreg_test.xls")
predictions = predict(model, new_data_H)
cor.test(new_data_H$compositionality_score, predictions, method='pearson')
