// ez-indexation / app / node_modules / rd-teeft / lib / termextractor.js
// @kieffer, 7 Mar 2017, 2 KB, v0.0.0
/* global module */
/* jslint node: true */
/* jslint indent: 2 */
'use strict';

var Backbone = require('backbone');

module.exports = Backbone.Model.extend({
  SEARCH: 0,
  NOUN: 1,

  defaults: {
    tagger: null,
    filter: null,
  },

  initialize: function() {

  },

  call: function(text) {
    var terms = this.get('tagger').call(text);
    return this.extract(terms);
  },

  extract: function(taggedTerms) {
    var terms = {
      _add: function(norm) {
        if (!this[norm]) {
          this[norm] = {
            frequency: 0
          };
        }
        this[norm].frequency++;
      }
    };

    //# Phase 1: A little state machine is used to build simple and
    //# composite terms.
    var multiterm = [];
    var state = this.SEARCH;
    var word;

    while (taggedTerms.length > 0) {
      var tagged_term = taggedTerms.shift();
      var term = tagged_term.term;
      var tag = tagged_term.tag;
      var norm = tagged_term.lemma;
      var startsWithN = this._startsWith(tag, 'N');
      var startsWithJ = this._startsWith(tag, 'J');

      if (state == this.SEARCH && startsWithN) {
        state = this.NOUN;
        multiterm.push(term);
        terms._add(norm);
      } else if (state == this.SEARCH && startsWithJ) {
        state = this.NOUN;
        multiterm.push(term);
        terms._add(norm);
      } else if (state == this.NOUN && startsWithN) {
        multiterm.push(term);
        terms._add(norm);
      } else if (state == this.NOUN && !startsWithN) {
        state = this.SEARCH;
        if (multiterm.length > 1) {
          word = multiterm.join(' ');
          terms._add(word);
        }
        multiterm = [];
      }
    }

    //# Phase 2: Only select the terms that fulfill the filter criteria.
    //# Also create the term strength.
    var result = {};
    delete terms._add;
    for (word in terms) {
      var occur = terms[word].frequency;
      var strength = word.split(" ").length;
      if (this.get('filter').call(occur, strength)) {
        result[word] = {
          frequency: occur,
          strength: strength
        };
      }
    }
    return result;
  },

  _startsWith: function(str, prefix) {
    return str.substring(0, prefix.length) === prefix;
  }

});