#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author: Hugo Andrade (Chalmers University of Technology)

#Standardize string encoding in UTF-8
from __future__ import unicode_literals
import sys 
# Python 2 only: re-expose sys.setdefaultencoding (removed from the module at
# start-up) so that implicit str <-> unicode conversions use UTF-8 instead of
# ASCII.  Python 3 already defaults to UTF-8 and has neither the ``reload``
# builtin nor ``sys.setdefaultencoding``, so the hack is skipped there.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding('utf-8')

import string, nltk, codecs

from nltk import wordpunct_tokenize, FreqDist, bigrams, collocations, WordPunctTokenizer
from nltk.collocations import *
from nltk.corpus import stopwords

#stemming words
#from nltk.stem.porter import PorterStemmer 


def readtext(name_file):
    """Read a UTF-8 encoded text file and return its full content.

    :param name_file: path of the file to read.
    :return: the decoded text of the whole file.
    """
    # codecs.open decodes while reading, so the caller gets text, not bytes.
    with codecs.open(name_file, mode='r', encoding='utf8') as source:
        return source.read()
    
    
# Translation table mapping every ASCII punctuation code point to None,
# i.e. "delete this character" when passed to unicode/str.translate.
_PUNCTUATION_TABLE = dict.fromkeys(map(ord, string.punctuation))

# Common words introduced by IEEE/ScienceDirect/Scopus/WebofScience exports.
# The right single quotation mark and the copyright sign are written as real
# code points: the former '\xe2\x80\x99' / '\xc2\xa9' literals were the UTF-8
# bytes read as separate characters and therefore never matched a token.
_BLACKLIST = frozenset([
    'conference', 'publications', 'international', 'ieee', 'comput', 'sci',
    'journals', 'magazines', 'paper', 'presents', 'symposium', 'present',
    'eng', 'univ', 'technol', 'inst', 'inf', 'proceedings', 'sept',
    'abstract', 'pages', 'issue', 'journal', 'issn', 'january', 'publisher',
    'summary', 'morgan', 'volume', 'february', 'march', 'april', 'may',
    'june', 'july', 'august', 'september', 'october', 'november',
    'december', 'kaufmann', '1', u'\u2019', 'scopus', 'document', 'type',
    'source', 'pp', 'york', 'united', 'author', 'keywords', 'article',
    'report', 'acm', 'press', 'ny', 'usa', 'springer', 'washington', 'cited',
    'online', 'conf', 'kingdom', 'heidelberg', 'wiley', 'sigcomm',
    'proc', 'phd', 'thesis', 'prentice', 'institute', 'lecture',
    'university', 'rep', 'diego', 'et', u'\u00a9', 'doi',
])


def normalize( document ):
    """Turn raw text into a filtered list of lower-case tokens.

    Steps, in order:

    1. delete every ASCII punctuation character (characters are removed, not
       replaced by a space, so ``state-of-the-art`` becomes
       ``stateoftheart``);
    2. tokenize with NLTK's ``wordpunct_tokenize``;
    3. lower-case every token;
    4. drop NLTK's English stop words and the export boilerplate words
       listed in ``_BLACKLIST``;
    5. convert the surviving tokens with ``str()``.

    :param document: text to normalize (unicode text, as returned by
        ``readtext``).
    :return: list of tokens in their original order.
    """
    stripped = document.translate(_PUNCTUATION_TABLE)

    # One set for both filters gives O(1) membership tests per token.
    ignored = _BLACKLIST.union(nltk.corpus.stopwords.words('english'))

    # Lower-case and filter while the tokens are still text, so non-ASCII
    # letters are folded correctly and compare equal to the set entries.
    lowered = (token.lower() for token in wordpunct_tokenize(stripped))

    # str() is kept from the original so the return type does not change:
    # on Python 2 it yields byte strings (no u'' prefix when printed) and
    # relies on the UTF-8 default encoding set at the top of the file; on
    # Python 3 it is a no-op.
    return [str(token) for token in lowered if token not in ignored]


			
def ngramize( document, n=3 ):
    """Build the n-grams of a token sequence.

    :param document: sequence of tokens, e.g. the list returned by
        ``normalize``.
    :param n: size of each n-gram; the default of 3 (trigrams) matches the
        previous hard-coded behaviour, pass 2 for bigrams.
    :return: iterable of ``n``-tuples of consecutive tokens, as produced by
        ``nltk.ngrams``.
    :raises ValueError: if ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError('n must be a positive integer, got %r' % (n,))

    return nltk.ngrams(document, n)


	
################################  Perhaps useful ########################################			
# Stemming http://www.nltk.org/api/nltk.stem.html #
# 
# 	stem = PorterStemmer()	
# 	tokens = [stem.stem( t.lower() ) for t in tokens]
#
#########################################################################################
# Print text as demo 		
# 
# def printtext( document, normalized_document ):
# 
#     for i, sentence in enumerate( document ):
#         
#         print 'original:', sentence
#         print 'normalized:', ' '.join( normalized_document[ i ] )
#         bigram = ['$'] + normalized_document[ i ] + ['$']
#         print 'bigrams:', zip( bigram, bigram[1:])
#         print '-'*50		
#
#########################################################################################
# Normalize 
# 
#	normalized_document = list()
#     for sentence in document:
#         #Sentence as an array of tokens (words and punctuation)
#         tokens = wordpunct_tokenize( sentence )
# 
#         #Stem 
#         normalized_tokens = [stem.stem( t.lower() ) for t in tokens ]
# 
#         #Removes stopwords 
#         normalized_tokens = [ t for t in tokens if not t in stopwords ]
# 
#         normalized_document.append( normalized_tokens )
# 
########################################################################################

    


if __name__ == '__main__':

    # Input file: first command-line argument if given, otherwise the
    # default 'metadata.txt'.
    # Alternative input used previously: 'titles_abstracts_keyords_146.txt'
    file_name = sys.argv[1] if len(sys.argv) > 1 else 'metadata.txt'

    # Read file
    document = readtext(file_name)

    # Normalize document
    normalized_document = normalize(document)

    # Create the n-grams
    ngrams = ngramize(normalized_document)

    # Calculate frequency distribution for all the ngrams
    fdist = nltk.FreqDist(ngrams)

    # Print the 100 most common ngrams, one (ngram, count) pair per line.
    # print(entry) with a single argument behaves the same on Python 2 and 3.
    for entry in fdist.most_common(100):
        print(entry)