|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123 |
- // Copyright 2013 The Go Authors. All rights reserved.
- //
- // Use of this source code is governed by a BSD-style
- // license that can be found in the LICENSE file or at
- // https://developers.google.com/open-source/licenses/bsd.
-
- // This file implements the Paice/Husk stemming algorithm.
- // http://www.comp.lancs.ac.uk/computing/research/stemming/Links/paice.htm
-
- package database
-
- import (
- "bytes"
- "regexp"
- "strconv"
- )
-
// stemRuleText is the Paice/Husk rule table read by parseStemRules. Rules are
// separated by spaces or newlines and each one has the form
//
//	ending [*] digit [letters] terminator
//
// ending is a word ending spelled backwards (parseStemRules reverses it),
// "*" marks a rule that stem applies only while the word is still unmodified,
// digit is the number of trailing bytes to remove, letters are appended after
// the removal, and the terminator is ">" to keep applying rules or "." to
// stop.
- const stemRuleText = `
- ai*2. a*1.
- bb1.
- city3s. ci2> cn1t>
- dd1. dei3y> deec2ss. dee1. de2> dooh4>
- e1>
- feil1v. fi2>
- gni3> gai3y. ga2> gg1.
- ht*2. hsiug5ct. hsi3>
- i*1. i1y>
- ji1d. juf1s. ju1d. jo1d. jeh1r. jrev1t. jsim2t. jn1d. j1s.
- lbaifi6. lbai4y. lba3> lbi3. lib2l> lc1. lufi4y. luf3> lu2. lai3> lau3> la2> ll1.
- mui3. mu*2. msi3> mm1.
- nois4j> noix4ct. noi3> nai3> na2> nee0. ne2> nn1.
- pihs4> pp1.
- re2> rae0. ra2. ro2> ru2> rr1. rt1> rei3y>
- sei3y> sis2. si2> ssen4> ss0. suo3> su*2. s*1> s0.
- tacilp4y. ta2> tnem4> tne3> tna3> tpir2b. tpro2b. tcud1. tpmus2. tpec2iv. tulo2v. tsis0. tsi3> tt1.
- uqi3. ugo1.
- vis3j> vie0. vi2>
- ylb1> yli3y> ylp0. yl2> ygo1. yhp1. ymo1. ypo1. yti3> yte3> ytl2. yrtsi5. yra3> yro3> yfi3. ycn2t> yca3>
- zi2> zy1s.
- `
-
// stemRule is one parsed entry of the stemming rule table.
type stemRule struct {
	// text is the rule exactly as it was matched in the rule table.
	text string

	// suffix is the word ending the rule applies to, in reading order.
	// append is what gets added once the trailing bytes have been removed.
	suffix, append []byte

	// remove is the number of trailing bytes to drop from the word.
	remove int

	// intact restricts the rule to words that no earlier rule has changed.
	// more asks the stemmer to look for another rule after this one fires.
	intact, more bool
}
-
- func parseStemRules() map[byte][]*stemRule {
-
- rules := make(map[byte][]*stemRule)
- for _, m := range regexp.MustCompile(`(?m)(?:^| )([a-zA-Z]*)(\*?)([0-9])([a-zA-z]*)([.>])`).FindAllStringSubmatch(stemRuleText, -1) {
-
- suffix := []byte(m[1])
- for i := 0; i < len(suffix)/2; i++ {
- j := len(suffix) - 1 - i
- suffix[i], suffix[j] = suffix[j], suffix[i]
- }
-
- remove, _ := strconv.Atoi(m[3])
- r := &stemRule{
- text: m[0],
- suffix: suffix,
- intact: m[2] == "*",
- remove: remove,
- append: []byte(m[4]),
- more: m[5] == ">",
- }
- c := suffix[len(suffix)-1]
- rules[c] = append(rules[c], r)
- }
- return rules
- }
-
// stemRules is the rule table built from stemRuleText by parseStemRules when
// the package is initialized. It is indexed by the last byte of a word.
- var stemRules = parseStemRules()
-
// firstVowel reports the position of the first vowel in p, where positions
// are counted from offset (the byte p[0] is at position offset). The letters
// a, e, i, o and u are always vowels; y is a vowel anywhere except position 0.
// It returns -1 when p holds no vowel.
func firstVowel(offset int, p []byte) int {
	for idx := 0; idx < len(p); idx++ {
		pos := offset + idx
		c := p[idx]
		if c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u' {
			return pos
		}
		// A leading y acts as a consonant, so it only counts past position 0.
		if c == 'y' && pos > 0 {
			return pos
		}
	}
	return -1
}
-
- func acceptableStem(a, b []byte) bool {
- i := firstVowel(0, a)
- if i < 0 {
- i = firstVowel(len(a), b)
- }
- l := len(a) + len(b)
- if i == 0 {
- return l > 1
- }
- return i >= 0 && l > 2
- }
-
- func stem(s string) string {
- stem := bytes.ToLower([]byte(s))
- intact := true
- run := acceptableStem(stem, []byte{})
- for run {
- run = false
- for _, rule := range stemRules[stem[len(stem)-1]] {
- if bytes.HasSuffix(stem, rule.suffix) &&
- (intact || !rule.intact) &&
- acceptableStem(stem[:len(stem)-rule.remove], rule.append) {
- stem = append(stem[:len(stem)-rule.remove], rule.append...)
- intact = false
- run = rule.more
- break
- }
- }
- }
- return string(stem)
- }
|