var fs = require("fs");
var DEFAULT_STOP_WORDS = require("./stop_words");

/**
 * In-memory TF-IDF index over a corpus of tokenised documents.
 *
 * Documents are added from strings or files, tokenised into lower-cased
 * words with stop words removed, and can then be ranked against a query
 * using cosine similarity between TF-IDF vectors.
 */
class Tfidf {
  constructor() {
    // One token array per added document, in insertion order.
    this.corpus = [];
    // Tokens of the most recently added document (the same array that was
    // pushed onto `corpus`).
    this.pathList = [];
    // `{ index, document }` records linking a corpus position to the raw
    // string or file path it was built from.
    this.tracker = [];
  }

  /**
   * Builds a term -> occurrence-count map for one document of the corpus.
   *
   * @param {number} [docIndex=0] Position of the document in `corpus`.
   * @returns {Object<string, number>|number} Counts ordered by descending
   *   frequency (integer-like keys are still enumerated first, as for any
   *   JavaScript object); `{}` when no document exists at `docIndex`;
   *   `-1` when `corpus` has been unset.
   */
  buildFromText(docIndex = 0) {
    if (this.corpus == null) return -1;

    const doc = this.corpus[docIndex];
    if (doc == null) return {};

    // A prototype-less accumulator keeps tokens such as "constructor" or
    // "__proto__" from colliding with inherited Object.prototype members.
    const counts = Object.create(null);
    for (let i = 0; i < doc.length; i++) {
      const token = doc[i];
      counts[token] = (counts[token] || 0) + 1;
    }

    return Object.fromEntries(
      Object.entries(counts).sort((a, b) => b[1] - a[1])
    );
  }

  /**
   * Inverts a frequency map into a count -> labels map.
   *
   * Numeric values are grouped under their key. Any other value is treated
   * as a nested map and each of its entries is grouped under the label
   * "<outerKey> <innerKey>".
   *
   * @param {Object} list Frequency map, flat or one level nested.
   * @returns {Object<string, string[]>} Labels grouped by their count.
   */
  countTermFrequencies(list) {
    const byCount = {};
    const record = (count, label) => {
      if (Object.prototype.hasOwnProperty.call(byCount, count)) {
        byCount[count].push(label);
      } else {
        byCount[count] = [label];
      }
    };

    for (const outerKey in list) {
      const value = list[outerKey];
      if (typeof value === "number") {
        record(value, outerKey);
      } else {
        for (const innerKey in value) {
          record(value[innerKey], `${outerKey} ${innerKey}`);
        }
      }
    }
    return byCount;
  }

  /**
   * Splits raw text into lower-cased word tokens with stop words removed.
   *
   * The text is first cut into sentences on `.?!;()`, then each sentence is
   * split on whitespace, commas, colons and dangling hyphens; apostrophes
   * are dropped.
   *
   * @param {string} text Raw document text.
   * @returns {string[]} Tokens in document order (empty when `text` holds
   *   no sentence characters).
   */
  tokenizeText(text) {
    // match() returns null rather than [] when nothing matches.
    const sentences = text.match(/[^.?!;()]+/g) || [];
    const tokens = [];

    for (const sentence of sentences) {
      const words = sentence
        .toLowerCase()
        .replace(/([^\w]*-[^\w])+|[\s,:]+/g, " ")
        .replace(/^\s/, "")
        .replace(/\'/g, "")
        .split(/\s+/g);

      for (const word of words) {
        if (word !== "" && !DEFAULT_STOP_WORDS.includes(word)) {
          tokens.push(word);
        }
      }
    }
    return tokens;
  }

  /**
   * Tokenises a string and appends it to the corpus as a new document.
   *
   * @param {string} str Raw document text.
   * @returns {string[][]} The whole corpus, including the new document.
   */
  addDocumentFromStr(str) {
    this.pathList = this.tokenizeText(str);
    this.corpus.push(this.pathList);
    this.tracker.push({
      index: this.corpus.length - 1,
      document: str,
    });
    return this.corpus;
  }

  /**
   * Reads a UTF-8 file, tokenises it and appends it to the corpus as a new
   * document. Read errors from `fs.readFileSync` propagate to the caller and
   * leave the index untouched.
   *
   * @param {string} path Path of the file to read.
   * @returns {string[][]} The whole corpus, including the new document.
   */
  addDocumentFromFilePath(path) {
    const data = fs
      .readFileSync(path, { encoding: "utf8" })
      .replace(/[\r\n]/g, " ")
      .trim();

    this.pathList = this.tokenizeText(data);
    this.corpus.push(this.pathList);
    this.tracker.push({
      index: this.corpus.length - 1,
      document: path,
    });
    return this.corpus;
  }

  /**
   * Calculates the term frequency (tf) of a term in a document,
   * case-insensitively.
   *
   * @param {string} term Term to count.
   * @param {string[]} doc Token array of the document.
   * @returns {number} occurrences / (doc.length + 1); the +1 keeps the
   *   result finite for an empty document.
   */
  calculateTermFrequency(term, doc) {
    const needle = term.toLowerCase();
    let occurrences = 0;
    for (let i = 0; i < doc.length; i++) {
      if (doc[i].toLowerCase() === needle) {
        occurrences++;
      }
    }
    return occurrences / (doc.length + 1);
  }

  /**
   * Calculates the inverse document frequency (idf) of a term over the
   * corpus:
   *
   *   idf = log(totalDocuments / (documentsContainingTerm + 1)) + 1
   *
   * @param {string} term Term to look up (lower-cased before comparison).
   * @returns {number} The idf value, or `-1` when `corpus` has been unset.
   */
  calculateInverseDocumentFrequency(term) {
    if (this.corpus == null) return -1;

    const needle = term.toLowerCase();
    let docsWithTerm = 0;
    for (const doc of this.corpus) {
      if (doc.includes(needle)) {
        docsWithTerm++;
      }
    }
    // +1 in the denominator prevents division by zero for unseen terms.
    return Math.log(this.corpus.length / (docsWithTerm + 1)) + 1;
  }

  /**
   * Creates a vector holding the idf of every query term.
   *
   * @param {string|string[]} query Terms, or a space-separated string.
   * @returns {number[]|null} One idf per term; `null` when `corpus` has
   *   been unset.
   */
  createIdfModel(query) {
    const terms = Array.isArray(query) ? query : query.split(" ");
    if (this.corpus == null) return null;

    return terms.map((term) => this.calculateInverseDocumentFrequency(term));
  }

  /**
   * Creates the tf-idf vector (tf * idf per query term) of a document and
   * stores it on `this.vectorSpaceModel`.
   *
   * @param {string|string[]} query Terms, or a space-separated string.
   * @param {string[]} doc Token array of the document.
   * @returns {number[]|null} One tf-idf value per term; `null` when
   *   `corpus` has been unset.
   */
  createVectorSpaceModel(query, doc) {
    const terms = Array.isArray(query) ? query : query.split(" ");
    if (this.corpus == null) return null;

    const idfModel = this.createIdfModel(terms);
    const vectorSpaceModel = terms.map(
      (term, i) => idfModel[i] * this.calculateTermFrequency(term, doc)
    );

    this.vectorSpaceModel = vectorSpaceModel;
    return vectorSpaceModel;
  }

  /**
   * Calculates the Euclidean magnitude of a vector, ignoring components
   * that are not numeric.
   *
   * @param {number[]} vector Input vector.
   * @returns {number} Square root of the sum of squared components.
   */
  calculateMagnitude(vector) {
    let sumOfSquares = 0;
    for (let i = 0; i < vector.length; i++) {
      const square = vector[i] * vector[i];
      // The square is NaN exactly when the component does not coerce to a
      // number, so those components contribute nothing.
      if (!Number.isNaN(square)) {
        sumOfSquares += square;
      }
    }
    return Math.sqrt(sumOfSquares);
  }

  /**
   * Calculates the cosine similarity between the tf-idf vectors of a query
   * and a document. The higher the value, the closer the document matches
   * the query.
   *
   * @param {string|string[]} query Terms, or a space-separated string.
   * @param {string[]} doc Token array of the document.
   * @returns {number} Cosine similarity, or `0` when it is undefined
   *   (for example when either vector has zero magnitude).
   */
  calculateSimilarityIndex(query, doc) {
    const terms = Array.isArray(query) ? query : query.split(" ");
    const queryVector = this.createVectorSpaceModel(terms, terms);
    const docVector = this.createVectorSpaceModel(terms, doc);

    let dotProduct = 0;
    for (let i = 0; i < terms.length; i++) {
      const product = queryVector[i] * docVector[i];
      if (!Number.isNaN(product)) {
        dotProduct += product;
      }
    }

    const similarity =
      dotProduct /
      (this.calculateMagnitude(queryVector) *
        this.calculateMagnitude(docVector));
    return Number.isNaN(similarity) ? 0 : similarity;
  }

  /**
   * Ranks every document of the corpus against a query.
   *
   * A string query has its periods removed and is split on whitespace; an
   * array query is used as given. Empty terms are dropped so stray spaces
   * do not distort the query vector.
   *
   * @param {string|string[]} query Query text or list of terms.
   * @returns {{document: Object<string, number>, similarityIndex: number}[]}
   *   One entry per document, holding that document's term-count map and
   *   its similarity to the query, sorted by descending similarity.
   */
  rankDocumentsByQuery(query) {
    const rawTerms = Array.isArray(query)
      ? query
      : query.replace(/\./g, "").split(/\s+/);
    const terms = rawTerms.filter((term) => term !== "");

    const ranking = this.corpus.map((doc, i) => ({
      document: this.buildFromText(i),
      similarityIndex: this.calculateSimilarityIndex(terms, doc),
    }));

    ranking.sort((a, b) => b.similarityIndex - a.similarityIndex);
    return ranking;
  }

  /**
   * Returns the tracker records collected while adding documents.
   *
   * @returns {{index: number, document: string}[]} The live tracker array.
   */
  getTrackers() {
    return this.tracker;
  }
}

// Expose the Tfidf class as this module's only export (CommonJS).
module.exports = Tfidf;
