Newer
Older
ez-indexation / app / node_modules / javascript-lemmatizer / js / lemmatizer.js
@kieffer kieffer on 7 Mar 2017 11 KB v0.0.0
/*
* JavaScript Lemmatizer v0.0.2
* https://github.com/takafumir/javascript-lemmatizer
* MIT License
* by Takafumi Yamano
*/

var _ = require('underscore');

// Define String#endsWith only when the runtime does not already provide it.
// The check must look at String.prototype: endsWith is an instance method, so
// testing `String.endsWith` is always "undefined" and would replace the native
// implementation (losing its optional position argument) on every load.
if (typeof String.prototype.endsWith !== "function") {
  // suffix: string to look for at the very end of the receiver
  // returns true when the receiver ends with suffix, false otherwise
  String.prototype.endsWith = function(suffix) {
    var start = this.length - suffix.length;
    // a suffix longer than the string can never match
    return start >= 0 && this.indexOf(suffix, start) === start;
  };
}

// module-level cache of required dictionary data, keyed by pos + '_idx' / pos + '_exc'
var dictionaries = {};

// Lemmatizer constructor
// Builds the per-instance rule tables, then loads the WordNet dictionary files
// and fills this.wordlists / this.exceptions for every part of speech.
var Lemmatizer = function() {
  // [ index file, exception file ] for each part of speech
  this.wn_files = {
    noun: [
      '../dict/index.noun.json',
      '../dict/noun.exc.json'
    ],
    verb: [
      '../dict/index.verb.json',
      '../dict/verb.exc.json'
    ],
    adj:  [
      '../dict/index.adj.json',
      '../dict/adj.exc.json'
    ],
    adv:  [
      '../dict/index.adv.json',
      '../dict/adv.exc.json'
    ]
  };

  // [ suffix found on the inflected form, replacement that yields a candidate base ]
  this.morphological_substitutions = {
    noun: [
      ['ies',  'y'  ],
      ['ves',  'f'  ],
      ['men',  'man']
    ],
    verb: [
      ['ies', 'y'],
      ['ied', 'y'],
      ['cked', 'c'],
      ['cked', 'ck'],
      ['able', 'e'],
      ['able', ''],
      ['ability', 'e'],
      ['ability', '']
    ],
    adj:  [
      ['er',  '' ],
      ['est', '' ],
      ['er',  'e'],
      ['est', 'e'],
      ['ier', 'y'],
      ['iest', 'y']
    ],
    adv:  [
      ['er',  '' ],
      ['est', '' ],
      ['er',  'e'],
      ['est', 'e'],
      ['ier', 'y'],
      ['iest', 'y']
    ]
  };

  this.wordlists  = {};
  this.exceptions = {};

  var pos;

  // initialize wordlists and exceptions
  // The per-pos maps are keyed by arbitrary words, so they are created without
  // a prototype: a lookup for "toString" or "constructor" must not find the
  // members every plain object inherits from Object.prototype.
  for (pos in this.morphological_substitutions) {
    this.wordlists[pos] = Object.create(null);
    this.exceptions[pos] = Object.create(null);
  }

  // load the index and exception file of every part of speech
  for (pos in this.wn_files) {
    this.load_wordnet_files(pos, this.wn_files[pos][0], this.wn_files[pos][1]);
  }

  // once everything is loaded, set up wordlists and exceptions from the loaded data
  for (pos in this.wn_files) {
    this.setup_dic_data(pos);
  }
};

// Lemmatizer properties
Lemmatizer.prototype = {
  // word currently being lemmatized; set by lemmas() and read by the private helpers
  form: '',
  // suffixes appended to a pos name to build the dictionary cache keys
  idx: '_idx',
  exc: '_exc',
  // lemmas collected for the current form
  lems: [], // -> [ ["lemma1", "verb"], ["lemma2", "noun"]... ]

  // **************************************************
  // public
  // **************************************************
  // return Array of ["lemma", "pos"] pairs
  // like [ ["lemma1", "verb"], ["lemma2", "noun"]... ]
  //   form: the inflected word to lemmatize
  //   pos:  optional, one of 'verb' | 'noun' | 'adj' | 'adv'; when omitted all
  //         four are tried
  // An unknown pos logs a warning and returns undefined.
  // Lemmas of one character or less are dropped by uniq_lemmas.
  lemmas: function(form, pos) {
    var self = this;
    var parts = ['verb', 'noun', 'adj', 'adv'];

    this.lems = [];
    this.form = form;

    if ( pos && parts.indexOf(pos) === -1 ) {
      console.log("warning: pos must be 'verb' or 'noun' or 'adj' or 'adv'.");
      return;
    }

    if (!pos) {
      parts.forEach( function(part) { self.irregular_bases(part); } );
      parts.forEach( function(part) { self.regular_bases(part); } );

      // when lemma not found and the form is included in wordlists.
      if ( this.is_lemma_empty() ) {
        parts.forEach( function(part) {
          if ( self.own_entry(self.wordlists[part], form) ) {
            self.lems.push([ form, part ]);
          }
        });
      }
      // when lemma not found and the form is not included in wordlists.
      if ( this.is_lemma_empty() ) {
        this.lems.push([ form, '' ]);
      }
    } else {
      this.base_forms(pos);
      if ( this.is_lemma_empty() ) {
        this.lems.push([ form, pos ]);
      }
    }

    // sort to verb -> noun -> adv -> adj (descending pos name, '' last).
    // Pairs sharing a pos come out in reverse discovery order; the index
    // tie-break makes that explicit instead of relying on sort stability.
    var ranked = this.uniq_lemmas(this.lems).map( function(pair, index) {
      return { pair: pair, index: index };
    });
    ranked.sort( function(a, b) {
      if ( a.pair[1] !== b.pair[1] ) {
        return a.pair[1] < b.pair[1] ? 1 : -1;
      }
      return b.index - a.index;
    });
    return ranked.map( function(item) { return item.pair; } );
  },

  // return only uniq lemmas without pos like [ 'high' ] or [ 'leave', 'leaf' ]
  // An unknown pos yields [] (lemmas() returned undefined).
  only_lemmas: function(form, pos) {
    var pairs = this.lemmas(form, pos) || [];
    var words = pairs.map( function(pair) { return pair[0]; } );
    // keep the first occurrence of every lemma
    return words.filter( function(word, index) { return words.indexOf(word) === index; } );
  },


  // **************************************************
  // private
  // The following properties(methods) are only used by
  // Lemmatizer inside, so don't call them from outside.
  // **************************************************
  is_lemma_empty: function() {
    return this.lems.length === 0;
  },

  // Value stored under key in a dictionary map, or undefined when the map has
  // no entry of its own. Going through hasOwnProperty keeps members inherited
  // from Object.prototype ("toString", "constructor", ...) from being
  // mistaken for dictionary words.
  own_entry: function(table, key) {
    return Object.prototype.hasOwnProperty.call(table, key) ? table[key] : undefined;
  },

  // true when word is listed as a lemma of the given pos (wordlists map a word to itself)
  is_known: function(pos, word) {
    var entry = this.own_entry(this.wordlists[pos], word);
    return Boolean(entry) && entry === word;
  },

  // set up dictionary data
  // load the index file and the exception file of one pos into the module cache
  load_wordnet_files: function(pos, list, exc) {
    var key_idx = pos + this.idx;
    this.open_file(key_idx, list);
    var key_exc = pos + this.exc;
    this.open_file(key_exc, exc);
  },

  // copy the cached data of one pos into this.wordlists / this.exceptions
  // _.each is kept here because the shape of the required JSON is not visible
  // from this file (presumably arrays of words and of [form, lemma] pairs).
  setup_dic_data: function(pos) {
    var self = this;
    var key_idx = pos + this.idx;
    _.each( this.fetch_data(key_idx), function(w) {
      self.wordlists[pos][w] = w;
    });
    var key_exc = pos + this.exc;
    _.each( this.fetch_data(key_exc), function(item) {
      var w = item[0];
      var s = item[1];
      self.exceptions[pos][w] = s;
    });
  },

  // require a dictionary file once and remember it under key
  open_file: function(key, file) {
    if (!dictionaries[key]) {
      dictionaries[key] = require(file);
    }
  },

  // data previously stored under key by open_file (undefined when never loaded)
  fetch_data: function(key) {
    return dictionaries[key];
  },
  // end of set up dictionary data

  // collect irregular and regular lemmas of this.form for one pos
  base_forms: function(pos) {
    this.irregular_bases(pos);
    this.regular_bases(pos);
  },

  // build array lemmas(this.lems) like [ [lemma1, "verb"], [lemma2, "noun"]... ]
  // from the exception list: pushes the listed base when it differs from the form
  irregular_bases: function(pos) {
    var base = this.own_entry(this.exceptions[pos], this.form);
    if (base && base !== this.form) {
      this.lems.push( [base, pos] );
    }
  },

  // build array lemmas(this.lems) like [ [lemma1, "verb"], [lemma2, "noun"]... ]
  // from the suffix rules: candidates are generated, then filtered by check_lemmas
  regular_bases: function(pos) {
    var bases = null;
    // bases -> [ [lemma1, lemma2, lemma3...], pos ]
    switch (pos) {
      case 'verb':
        bases = this.possible_verb_bases();
        break;
      case 'noun':
        bases = this.possible_noun_bases();
        break;
      case 'adj':
      case 'adv':
        bases = this.possible_adj_adv_bases(pos);
        break;
      default:
        break;
    }
    if (bases) {
      this.check_lemmas(bases);
    }
  },

  // check if possible bases are included in lemma wordlists and push those that are
  check_lemmas: function(bases) {
    var self = this;
    // bases -> [ [lemma1, lemma2, lemma3...], pos ]
    var candidates = bases[0];
    var pos = bases[1];
    candidates.forEach( function(lemma) {
      if ( self.is_known(pos, lemma) ) {
        self.lems.push( [lemma, pos] );
      }
    });
  },

  // candidate verb bases of this.form -> [ [base1, base2...], 'verb' ]
  possible_verb_bases: function() {
    var form = this.form;
    var lemmas = [];

    if ( this.ends_with_es() ) {
      // goes -> go
      var verb_base = form.slice( 0, -2 );
      lemmas.push( verb_base );
      if ( !this.is_known('verb', verb_base) ) {
        // opposes -> oppose
        lemmas.push( form.slice( 0, -1 ) );
      }
    } else if ( this.ends_with_verb_vowel_ys() ) {
      // annoys -> annoy
      lemmas.push( form.slice( 0, -1 ) );
    } else if ( form.endsWith('ed') && this.double_consonant('ed') ) {
      // Must be tested before the generic 'ed' rule below (same order as the
      // 'ing' rules): placed after it, this branch was only reachable for
      // forms ending in 'ied' / 'cked'.
      // dragged -> drag
      lemmas.push( form.slice( 0, -3 ) );
      // added -> add
      lemmas.push( form.slice( 0, -2 ) );
      // pirouetted -> pirouette
      lemmas.push( form.slice( 0, -2 ) + 'e' );
    } else if ( form.endsWith('ed') && !form.endsWith('ied') && !form.endsWith('cked') ) {
      // saved -> save
      var past_base = form.slice( 0, -1 );
      lemmas.push( past_base );
      if ( !this.is_known('verb', past_base) ) {
        // talked -> talk, but not push like coded -> cod
        lemmas.push( form.slice( 0, -2 ) );
      }
    } else if ( form.endsWith('ing') && this.double_consonant('ing') ) {
      // dragging -> drag
      lemmas.push( form.slice( 0, -4 ) );
      // adding -> add
      lemmas.push( form.slice( 0, -3 ) );
      // pirouetting -> pirouette
      lemmas.push( form.slice( 0, -3 ) + 'e' );
    } else if ( form.endsWith('ing') && !this.own_entry(this.exceptions['verb'], form) ) {
      // coding -> code
      var ing_base = form.slice( 0, -3 ) + 'e';
      lemmas.push( ing_base );
      if ( !this.is_known('verb', ing_base) ) {
        // talking -> talk, but not push like coding -> cod
        lemmas.push( form.slice( 0, -3 ) );
      }
    } else if ( form.endsWith('able') && this.double_consonant('able') ) {
      // stoppable -> stop
      lemmas.push( form.slice( 0, -5 ) );
    } else if ( form.endsWith('ability') && this.double_consonant('ability') ) {
      // stoppability -> stop
      lemmas.push( form.slice( 0, -8 ) );
    } else if ( form.endsWith('s') ) {
      lemmas.push( form.slice( 0, -1 ) );
    }

    this.morphological_substitutions["verb"].forEach( function(entry) {
      var morpho = entry[0];
      var origin = entry[1];
      if ( form.endsWith(morpho) ) {
        lemmas.push( form.slice( 0, -(morpho.length) ) + origin );
      }
    });

    // the form itself may already be a base
    lemmas.push(form);

    return [ lemmas, 'verb' ];
  },

  // candidate noun bases of this.form -> [ [base1, base2...], 'noun' ]
  possible_noun_bases: function() {
    var form = this.form;
    var lemmas = [];

    if ( this.ends_with_es() ) {
      // watches -> watch
      var noun_base = form.slice( 0, -2 );
      lemmas.push( noun_base );
      if ( !this.is_known('noun', noun_base) ) {
        // horses -> horse
        lemmas.push( form.slice( 0, -1 ) );
      }
    } else if ( form.endsWith('s') ) {
      lemmas.push( form.slice( 0, -1 ) );
    }

    this.morphological_substitutions["noun"].forEach( function(entry) {
      var morpho = entry[0];
      var origin = entry[1];
      if ( form.endsWith(morpho) ) {
        lemmas.push( form.slice( 0, -(morpho.length) ) + origin );
      }
    });

    // to push a word like 'us' as it is
    lemmas.push(form);

    return [ lemmas, 'noun' ];
  },

  // candidate adjective / adverb bases of this.form -> [ [base1, base2...], pos ]
  possible_adj_adv_bases: function(pos) {
    var form = this.form;
    var lemmas = [];

    if ( form.endsWith('est') && this.double_consonant('est') ) {
      // biggest -> big
      lemmas.push( form.slice( 0, -4 ) );
    } else if ( form.endsWith('er') && this.double_consonant('er') ) {
      // bigger -> big
      lemmas.push( form.slice( 0, -3 ) );
    }

    this.morphological_substitutions[pos].forEach( function(entry) {
      var morpho = entry[0];
      var origin = entry[1];
      if ( form.endsWith(morpho) ) {
        lemmas.push( form.slice( 0, -(morpho.length) ) + origin );
      }
    });

    // to push a word like 'after' as it is
    lemmas.push(form);

    return [ lemmas, pos ];
  },

  // true when, once suffix is removed, this.form ends in vowel + doubled consonant
  double_consonant: function(suffix) {
    // for like bigger -> big
    var form = this.form;
    // length after removing suffix from form
    var len = form.length - suffix.length;
    // out-of-range indexes read as undefined, which is_vowel treats as "not a vowel"
    return this.is_vowel(form[len - 3]) && !this.is_vowel(form[len - 2]) && form[len - 2] === form[len - 1];
  },

  is_vowel: function(letter) {
    return ["a", "e", "i", "o", "u"].indexOf(letter) !== -1;
  },

  // [ ["leave", "verb"], ["leaf", "noun"], ["leave", "verb"], ["leave", "noun"] ];
  // -> [ ["leave", "verb"], ["leaf", "noun"], ["leave", "noun"] ];
  // Lemmas of one character or less are dropped as well.
  uniq_lemmas: function(lemmas) {
    var self = this;
    var u_lemmas = [];
    lemmas.forEach( function(val) {
      if (!self.is_include(u_lemmas, val) && val[0].length > 1) {
        u_lemmas.push(val);
      }
    });
    return u_lemmas;
  },

  // true when lemmas already holds a pair with the same lemma and the same pos
  is_include: function(lemmas, target) {
    return lemmas.some( function(pair) {
      return pair[0] === target[0] && pair[1] === target[1];
    });
  },

  // true when this.form ends in one of the sibilant / 'o' plural endings
  ends_with_es: function() {
    var form = this.form;
    var ends = ['ches', 'shes', 'oes', 'ses', 'xes', 'zes'];
    return ends.some( function(end) { return form.endsWith(end); } );
  },

  // true when this.form ends in vowel + 'ys'
  ends_with_verb_vowel_ys: function() {
    var form = this.form;
    var ends = ['ays', 'eys', 'iys', 'oys', 'uys'];
    return ends.some( function(end) { return form.endsWith(end); } );
  }
};

module.exports = Lemmatizer;