You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

124 lines
2.8 KiB

  1. // Copyright 2013 The Go Authors. All rights reserved.
  2. //
  3. // Use of this source code is governed by a BSD-style
  4. // license that can be found in the LICENSE file or at
  5. // https://developers.google.com/open-source/licenses/bsd.
  6. // This file implements the Paice/Husk stemming algorithm.
  7. // http://www.comp.lancs.ac.uk/computing/research/stemming/Links/paice.htm
  8. package database
  9. import (
  10. "bytes"
  11. "regexp"
  12. "strconv"
  13. )
  // stemRuleText is the rule table consumed by parseStemRules.
  //
  // Entries are separated by spaces or line breaks. As parsed by
  // parseStemRules, an entry consists of, in order:
  //
  //   - the word ending it applies to, spelled back to front;
  //   - an optional '*', marking a rule that only applies to a word that
  //     no earlier rule has modified;
  //   - one digit, the number of trailing bytes to remove;
  //   - optional letters to append after the removal;
  //   - a terminator: '.' ends stemming, '>' lets stemming continue.
  //
  // Each line of the table holds the rules whose ending finishes with the
  // same letter.
  14. const stemRuleText = `
  15. ai*2. a*1.
  16. bb1.
  17. city3s. ci2> cn1t>
  18. dd1. dei3y> deec2ss. dee1. de2> dooh4>
  19. e1>
  20. feil1v. fi2>
  21. gni3> gai3y. ga2> gg1.
  22. ht*2. hsiug5ct. hsi3>
  23. i*1. i1y>
  24. ji1d. juf1s. ju1d. jo1d. jeh1r. jrev1t. jsim2t. jn1d. j1s.
  25. lbaifi6. lbai4y. lba3> lbi3. lib2l> lc1. lufi4y. luf3> lu2. lai3> lau3> la2> ll1.
  26. mui3. mu*2. msi3> mm1.
  27. nois4j> noix4ct. noi3> nai3> na2> nee0. ne2> nn1.
  28. pihs4> pp1.
  29. re2> rae0. ra2. ro2> ru2> rr1. rt1> rei3y>
  30. sei3y> sis2. si2> ssen4> ss0. suo3> su*2. s*1> s0.
  31. tacilp4y. ta2> tnem4> tne3> tna3> tpir2b. tpro2b. tcud1. tpmus2. tpec2iv. tulo2v. tsis0. tsi3> tt1.
  32. uqi3. ugo1.
  33. vis3j> vie0. vi2>
  34. ylb1> yli3y> ylp0. yl2> ygo1. yhp1. ymo1. ypo1. yti3> yte3> ytl2. yrtsi5. yra3> yro3> yfi3. ycn2t> yca3>
  35. zi2> zy1s.
  36. `
  // stemRule is a single parsed entry of the stemming rule table.
  type stemRule struct {
  	// text is the raw rule text exactly as it was matched in the table.
  	text string

  	// suffix is the word ending the rule applies to, in reading order.
  	suffix []byte
  	// remove is the number of trailing bytes cut from the word.
  	remove int
  	// append holds the bytes added after the cut; it may be empty.
  	append []byte

  	// intact restricts the rule to words that no earlier rule has changed.
  	intact bool
  	// more reports whether stemming continues after this rule is applied.
  	more bool
  }
  45. func parseStemRules() map[byte][]*stemRule {
  46. rules := make(map[byte][]*stemRule)
  47. for _, m := range regexp.MustCompile(`(?m)(?:^| )([a-zA-Z]*)(\*?)([0-9])([a-zA-z]*)([.>])`).FindAllStringSubmatch(stemRuleText, -1) {
  48. suffix := []byte(m[1])
  49. for i := 0; i < len(suffix)/2; i++ {
  50. j := len(suffix) - 1 - i
  51. suffix[i], suffix[j] = suffix[j], suffix[i]
  52. }
  53. remove, _ := strconv.Atoi(m[3])
  54. r := &stemRule{
  55. text: m[0],
  56. suffix: suffix,
  57. intact: m[2] == "*",
  58. remove: remove,
  59. append: []byte(m[4]),
  60. more: m[5] == ">",
  61. }
  62. c := suffix[len(suffix)-1]
  63. rules[c] = append(rules[c], r)
  64. }
  65. return rules
  66. }
  67. var stemRules = parseStemRules()
  // firstVowel returns offset plus the index of the first vowel in p, or -1
  // if p contains none. The letters a, e, i, o and u always count as vowels;
  // y counts only when its resulting position (offset plus index) is greater
  // than zero.
  func firstVowel(offset int, p []byte) int {
  	for i := 0; i < len(p); i++ {
  		pos := offset + i
  		c := p[i]
  		if c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u' {
  			return pos
  		}
  		if c == 'y' && pos > 0 {
  			return pos
  		}
  	}
  	return -1
  }
  81. func acceptableStem(a, b []byte) bool {
  82. i := firstVowel(0, a)
  83. if i < 0 {
  84. i = firstVowel(len(a), b)
  85. }
  86. l := len(a) + len(b)
  87. if i == 0 {
  88. return l > 1
  89. }
  90. return i >= 0 && l > 2
  91. }
  92. func stem(s string) string {
  93. stem := bytes.ToLower([]byte(s))
  94. intact := true
  95. run := acceptableStem(stem, []byte{})
  96. for run {
  97. run = false
  98. for _, rule := range stemRules[stem[len(stem)-1]] {
  99. if bytes.HasSuffix(stem, rule.suffix) &&
  100. (intact || !rule.intact) &&
  101. acceptableStem(stem[:len(stem)-rule.remove], rule.append) {
  102. stem = append(stem[:len(stem)-rule.remove], rule.append...)
  103. intact = false
  104. run = rule.more
  105. break
  106. }
  107. }
  108. }
  109. return string(stem)
  110. }