var fs = require("fs");
var DEFAULT_STOP_WORDS = require("./stop_words");

/**
 * Small TF-IDF / cosine-similarity ranker over an in-memory corpus.
 *
 * A corpus is an array of documents; each document is an array of string
 * tokens. Build it with `createCorpusFromStringArray` (plain space split) or
 * `createCorpusFromStringArray1` (sentence split, lower-casing, stop-word
 * removal), then rank it with `rankDocumentsByQuery`.
 *
 * Note: tf, idf and tf-idf values are returned as strings rounded to four
 * decimals (the result of `Number.prototype.toFixed(4)`); only
 * `calculateSimilarityIndex` returns a number.
 */
class Tfidf {
  /**
   * @param {{verbose?: boolean}} [options] - `verbose: false` silences the
   *   console.log trace emitted by the methods below. Tracing stays on by
   *   default.
   */
  constructor(options = {}) {
    this.corpus = [];
    this.pathList = [];
    this.docs = [];
    this.verbose = !(options && options.verbose === false);
  }

  /**
   * Forwards its arguments to console.log when tracing is enabled.
   */
  _log(...args) {
    if (this.verbose) {
      console.log(...args);
    }
  }

  /**
   * Turns a query into an array of terms.
   *
   * Arrays are returned unchanged. Strings have parentheses replaced by
   * spaces and quotes removed, are split on single spaces, and the empty
   * strings produced by that split (leading/trailing/double spaces) are
   * dropped so they do not add spurious weight to the query vector.
   *
   * @param {string|string[]} query
   * @returns {string[]}
   */
  _normalizeQuery(query) {
    if (Array.isArray(query)) {
      return query;
    }
    return query
      .replace(/\(/g, " ")
      .replace(/\)/g, " ")
      .replace(/\'/g, "")
      .replace(/\"/g, "")
      .split(" ")
      .filter((term) => term !== "");
  }

  /**
   * Reads a UTF-8 file synchronously, splits it on "\n", trims every line and
   * appends the lines to `this.docs` (lines accumulate across calls).
   *
   * @param {string} path - File to read.
   * @returns {string[]|undefined} `this.docs`, or `undefined` when reading
   *   fails (the error is reported through console.error).
   */
  createDocumentFromFilePath(path) {
    try {
      const data = fs.readFileSync(path, { encoding: "utf8" });
      for (const line of data.split("\n")) {
        this.docs.push(line.trim());
      }
      return this.docs;
    } catch (e) {
      console.error(e);
    }
  }

  /**
   * Appends one token list per input string to the corpus. Each string has
   * CR/LF replaced by spaces, is trimmed and split on spaces; empty tokens
   * are dropped, so a blank string becomes an empty document (`[]`).
   *
   * @param {string[]} docs
   * @returns {string[][]} The whole corpus (`this.corpus`).
   */
  createCorpusFromStringArray(docs) {
    for (let i = 0; i < docs.length; i++) {
      const tokens = docs[i]
        .replace(/[\r\n]/g, " ")
        .trim()
        .split(" ")
        .filter((token) => token !== "");
      this._log("item", i, tokens);
      this.corpus.push(tokens);
      this._log("corpus", this.corpus);
    }
    return this.corpus;
  }

  /**
   * Appends one token list per input string to the corpus. Each string is
   * cut into sentences at `? ! ; ( )`, lower-cased, stripped of quotes,
   * split on whitespace / commas / colons, and filtered against
   * DEFAULT_STOP_WORDS. A string with no sentence content (blank, or made
   * only of delimiters) becomes an empty document (`[]`).
   *
   * `this.pathList` is left pointing at the token list of the last document
   * processed.
   *
   * @param {string[]} docs
   * @returns {string[][]} The whole corpus (`this.corpus`).
   */
  createCorpusFromStringArray1(docs) {
    const sentenceSplitter = /[^?!;()]+/g;

    for (let i = 0; i < docs.length; i++) {
      const item = docs[i].replace(/[\r\n]/g, " ").trim();
      const tokens = [];

      // match() yields null when nothing matches (empty or delimiter-only
      // input); treat that as "no sentences" instead of crashing.
      const sentenceList = item.match(sentenceSplitter) || [];

      for (const sentence of sentenceList) {
        const words = sentence
          .toLowerCase()
          .replace(/([^\w]*-[^\w])+|[\s,:]+/g, " ")
          .replace(/^\s/, "")
          .replace(/\'/g, "")
          .replace(/\"/g, "")
          .split(/\s+/g);

        for (const word of words) {
          if (word !== "" && !DEFAULT_STOP_WORDS.includes(word)) {
            tokens.push(word);
          }
        }
      }

      this.pathList = tokens;
      this.corpus.push(tokens);
    }
    return this.corpus;
  }

  /**
   * Term frequency of `term` in `doc`, compared case-insensitively:
   * matches / (doc.length + 1). The +1 keeps the denominator non-zero for an
   * empty document.
   *
   * @param {string} term
   * @param {string[]} doc
   * @returns {string} The value formatted with four decimals.
   */
  calculateTermFrequency(term, doc) {
    const needle = term.toLowerCase();
    let cnt = 0;
    for (let i = 0; i < doc.length; i++) {
      if (doc[i].toLowerCase() === needle) {
        cnt++;
      }
    }
    const tf = (cnt / (doc.length + 1)).toFixed(4);

    this._log("tf:", term, tf);
    return tf;
  }

  /**
   * Inverse document frequency of `term` over the corpus:
   * idf = ln(nonEmptyDocs / (docsContainingTerm + 1)) + 1
   * Tokens are compared case-insensitively, matching calculateTermFrequency.
   *
   * @param {string} term
   * @returns {string|number} The value formatted with four decimals, or -1
   *   when there is no corpus.
   */
  calculateInverseDocumentFrequency(term) {
    if (this.corpus == null) return -1;

    const needle = term.toLowerCase();
    let cntDocs = 0;
    let nonEmptyDocs = 0;

    for (const doc of this.corpus) {
      // Empty documents are excluded from the document count.
      if (doc.length > 0) {
        nonEmptyDocs++;
      }
      if (doc.some((token) => token.toLowerCase() === needle)) {
        cntDocs++;
      }
    }

    // cntDocs + 1 prevents a zero denominator.
    const idf = (Math.log(nonEmptyDocs / (cntDocs + 1)) + 1).toFixed(4);
    this._log("term idf: ", term, idf);
    return idf;
  }

  /**
   * Builds the vector of idf values, one per query term.
   *
   * @param {string|string[]} query
   * @returns {Array<string|number>|null} `null` when there is no corpus.
   */
  createIdfModel(query) {
    const terms = this._normalizeQuery(query);

    if (this.corpus == null) return null;

    const model = [];
    for (let i = 0; i < terms.length; i++) {
      model.push(this.calculateInverseDocumentFrequency(terms[i]));
    }
    this._log("Idf model: ", model);
    return model;
  }

  /**
   * Builds the tf-idf vector (tf * idf per query term) of `doc` with respect
   * to `query`.
   *
   * @param {string|string[]} query
   * @param {string[]} doc
   * @returns {string[]|null} One four-decimal string per query term, or
   *   `null` when there is no corpus.
   */
  createVectorSpaceModel(query, doc) {
    const terms = this._normalizeQuery(query);
    if (this.corpus == null) return null;

    const termFrequencyModel = [];
    for (let i = 0; i < terms.length; i++) {
      termFrequencyModel.push(this.calculateTermFrequency(terms[i], doc));
    }

    const idfModel = this.createIdfModel(terms);
    this._log("termFrequencyModel:", termFrequencyModel, doc);

    const vectorSpaceModel = [];
    for (let j = 0; j < idfModel.length; j++) {
      // Both operands are numeric strings; `*` converts them to numbers.
      vectorSpaceModel[j] = (idfModel[j] * termFrequencyModel[j]).toFixed(4);
    }

    this._log("vectorSpaceModel: tf-idf", vectorSpaceModel, doc);
    return vectorSpaceModel;
  }

  /**
   * Euclidean length of a vector. Entries may be numbers or numeric strings;
   * entries that are not numeric are ignored.
   *
   * @param {Array<number|string>} vector
   * @returns {number}
   */
  calculateMagnitude(vector) {
    let sumOfSquares = 0;
    for (let i = 0; i < vector.length; i++) {
      const value = Number(vector[i]);
      if (!Number.isNaN(value)) {
        sumOfSquares += value * value;
      }
    }
    return Math.sqrt(sumOfSquares);
  }

  /**
   * Cosine similarity between the tf-idf vector of the query and the tf-idf
   * vector of `doc`; a higher value means a closer match.
   *
   * @param {string|string[]} query
   * @param {string[]} doc
   * @returns {number} Similarity rounded to four decimals; 0 when it cannot
   *   be computed (for example when either vector has zero length).
   */
  calculateSimilarityIndex(query, doc) {
    const terms = this._normalizeQuery(query);
    this._log("query: " + terms);
    this._log("documents: " + doc);

    const queryVector = this.createVectorSpaceModel(terms, terms);
    const docVector = this.createVectorSpaceModel(terms, doc);

    this._log("query_vector: " + queryVector);
    this._log("doc_vector: " + docVector);

    let dotProduct = 0;
    for (let i = 0; i < terms.length; i++) {
      const product = queryVector[i] * docVector[i];
      if (!Number.isNaN(product)) {
        dotProduct += product;
      }
    }

    const queryMag = this.calculateMagnitude(queryVector);
    const docMag = this.calculateMagnitude(docVector);
    const similarity = Number((dotProduct / (queryMag * docMag)).toFixed(4));

    this._log("similarity:", similarity);
    return Number.isNaN(similarity) ? 0 : similarity;
  }

  /**
   * Ranks every document of the corpus against a query.
   *
   * @param {string|string[]} query
   * @returns {Array<{document: string[], similarityIndex: number, line: number}>}
   *   One entry per document, sorted by descending similarity. `line` is the
   *   1-based position of the document in the corpus.
   */
  rankDocumentsByQuery(query) {
    const terms = this._normalizeQuery(query);

    const ranking = [];
    for (let i = 0; i < this.corpus.length; i++) {
      this._log("================================================");
      this._log("lineNum:", i + 1);
      this._log("corpus[i]", this.corpus[i]);

      ranking.push({
        document: this.corpus[i],
        similarityIndex: this.calculateSimilarityIndex(terms, this.corpus[i]),
        line: i + 1,
      });
    }
    ranking.sort((a, b) => b.similarityIndex - a.similarityIndex);
    return ranking;
  }
}

module.exports = Tfidf;
