// Copyright 2013 The Go Authors. All rights reserved. // // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file or at // https://developers.google.com/open-source/licenses/bsd. // This file implements the Paice/Husk stemming algorithm. // http://www.comp.lancs.ac.uk/computing/research/stemming/Links/paice.htm package database import ( "bytes" "regexp" "strconv" ) const stemRuleText = ` ai*2. a*1. bb1. city3s. ci2> cn1t> dd1. dei3y> deec2ss. dee1. de2> dooh4> e1> feil1v. fi2> gni3> gai3y. ga2> gg1. ht*2. hsiug5ct. hsi3> i*1. i1y> ji1d. juf1s. ju1d. jo1d. jeh1r. jrev1t. jsim2t. jn1d. j1s. lbaifi6. lbai4y. lba3> lbi3. lib2l> lc1. lufi4y. luf3> lu2. lai3> lau3> la2> ll1. mui3. mu*2. msi3> mm1. nois4j> noix4ct. noi3> nai3> na2> nee0. ne2> nn1. pihs4> pp1. re2> rae0. ra2. ro2> ru2> rr1. rt1> rei3y> sei3y> sis2. si2> ssen4> ss0. suo3> su*2. s*1> s0. tacilp4y. ta2> tnem4> tne3> tna3> tpir2b. tpro2b. tcud1. tpmus2. tpec2iv. tulo2v. tsis0. tsi3> tt1. uqi3. ugo1. vis3j> vie0. vi2> ylb1> yli3y> ylp0. yl2> ygo1. yhp1. ymo1. ypo1. yti3> yte3> ytl2. yrtsi5. yra3> yro3> yfi3. ycn2t> yca3> zi2> zy1s. ` type stemRule struct { text string suffix []byte intact bool remove int append []byte more bool } func parseStemRules() map[byte][]*stemRule { rules := make(map[byte][]*stemRule) for _, m := range regexp.MustCompile(`(?m)(?:^| )([a-zA-Z]*)(\*?)([0-9])([a-zA-z]*)([.>])`).FindAllStringSubmatch(stemRuleText, -1) { suffix := []byte(m[1]) for i := 0; i < len(suffix)/2; i++ { j := len(suffix) - 1 - i suffix[i], suffix[j] = suffix[j], suffix[i] } remove, _ := strconv.Atoi(m[3]) r := &stemRule{ text: m[0], suffix: suffix, intact: m[2] == "*", remove: remove, append: []byte(m[4]), more: m[5] == ">", } c := suffix[len(suffix)-1] rules[c] = append(rules[c], r) } return rules } var stemRules = parseStemRules() func firstVowel(offset int, p []byte) int { for i, b := range p { switch b { case 'a', 'e', 'i', 'o', 'u': return offset + i case 'y': if offset+i > 0 { return offset + i } } } return -1 } func acceptableStem(a, b []byte) bool { i := firstVowel(0, a) if i < 0 { i = firstVowel(len(a), b) } l := len(a) + len(b) if i == 0 { return l > 1 } return i >= 0 && l > 2 } func stem(s string) string { stem := bytes.ToLower([]byte(s)) intact := true run := acceptableStem(stem, []byte{}) for run { run = false for _, rule := range stemRules[stem[len(stem)-1]] { if bytes.HasSuffix(stem, rule.suffix) && (intact || !rule.intact) && acceptableStem(stem[:len(stem)-rule.remove], rule.append) { stem = append(stem[:len(stem)-rule.remove], rule.append...) intact = false run = rule.more break } } } return string(stem) }