You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 

210 lines
6.3 KiB

  1. // Copyright 2015 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. //go:generate go run gen.go
  5. // Package ianaindex maps names to Encodings as specified by the IANA registry.
  6. // This includes both the MIME and IANA names.
  7. //
  8. // See http://www.iana.org/assignments/character-sets/character-sets.xhtml for
  9. // more details.
  10. package ianaindex
  11. import (
  12. "errors"
  13. "sort"
  14. "strings"
  15. "golang.org/x/text/encoding"
  16. "golang.org/x/text/encoding/charmap"
  17. "golang.org/x/text/encoding/internal/identifier"
  18. "golang.org/x/text/encoding/japanese"
  19. "golang.org/x/text/encoding/korean"
  20. "golang.org/x/text/encoding/simplifiedchinese"
  21. "golang.org/x/text/encoding/traditionalchinese"
  22. "golang.org/x/text/encoding/unicode"
  23. )
  24. // TODO: remove the "Status... incomplete" in the package doc comment.
  25. // TODO: allow users to specify their own aliases?
  26. // TODO: allow users to specify their own indexes?
  27. // TODO: allow canonicalizing names
  28. // NOTE: only use these top-level variables if we can get the linker to drop
  29. // the indexes when they are not used. Make them a function or perhaps only
  30. // support MIME otherwise.
  31. var (
  32. // MIME is an index to map MIME names.
  33. MIME *Index = mime
  34. // IANA is an index that supports all names and aliases using IANA names as
  35. // the canonical identifier.
  36. IANA *Index = iana
  37. // MIB is an index that associates the MIB display name with an Encoding.
  38. MIB *Index = mib
  39. mime = &Index{mimeName, ianaToMIB, ianaAliases, encodings[:]}
  40. iana = &Index{ianaName, ianaToMIB, ianaAliases, encodings[:]}
  41. mib = &Index{mibName, ianaToMIB, ianaAliases, encodings[:]}
  42. )
  43. // Index maps names registered by IANA to Encodings.
  44. // Currently different Indexes only differ in the names they return for
  45. // encodings. In the future they may also differ in supported aliases.
  46. type Index struct {
  47. names func(i int) string
  48. toMIB []identifier.MIB // Sorted slice of supported MIBs
  49. alias map[string]int
  50. enc []encoding.Encoding
  51. }
  // Sentinel errors returned by the Index methods.
  var (
  	// errInvalidName is returned by Encoding when the name matches no alias,
  	// neither verbatim nor lower-cased.
  	errInvalidName = errors.New("ianaindex: invalid encoding name")

  	// errUnknown is returned by Name when the Encoding does not implement
  	// identifier.Interface or reports a zero MIB.
  	errUnknown = errors.New("ianaindex: unknown Encoding")

  	// errUnsupported is returned by Name when the Encoding's MIB is not
  	// among the MIBs supported by the index.
  	errUnsupported = errors.New("ianaindex: unsupported Encoding")
  )
  57. // Encoding returns an Encoding for IANA-registered names. Matching is
  58. // case-insensitive.
  59. func (x *Index) Encoding(name string) (encoding.Encoding, error) {
  60. name = strings.TrimSpace(name)
  61. // First try without lowercasing (possibly creating an allocation).
  62. i, ok := x.alias[name]
  63. if !ok {
  64. i, ok = x.alias[strings.ToLower(name)]
  65. if !ok {
  66. return nil, errInvalidName
  67. }
  68. }
  69. return x.enc[i], nil
  70. }
  71. // Name reports the canonical name of the given Encoding. It will return an
  72. // error if the e is not associated with a known encoding scheme.
  73. func (x *Index) Name(e encoding.Encoding) (string, error) {
  74. id, ok := e.(identifier.Interface)
  75. if !ok {
  76. return "", errUnknown
  77. }
  78. mib, _ := id.ID()
  79. if mib == 0 {
  80. return "", errUnknown
  81. }
  82. v := findMIB(x.toMIB, mib)
  83. if v == -1 {
  84. return "", errUnsupported
  85. }
  86. return x.names(v), nil
  87. }
  88. // TODO: the coverage of this index is rather spotty. Allowing users to set
  89. // encodings would allow:
  90. // - users to increase coverage
  91. // - allow a partially loaded set of encodings in case the user doesn't need
  92. // them all.
  93. // - write an OS-specific wrapper for supported encodings and set them.
  94. // The exact definition of Set depends a bit on if and how we want to let users
  95. // write their own Encoding implementations. Also, it is not possible yet to
  96. // only partially load the encodings without doing some refactoring. Until this
  97. // is solved, we might as well not support Set.
  98. // // Set sets the e to be used for the encoding scheme identified by name. Only
  99. // // canonical names may be used. An empty name assigns e to its internally
  100. // // associated encoding scheme.
  101. // func (x *Index) Set(name string, e encoding.Encoding) error {
  102. // panic("TODO: implement")
  103. // }
  104. func findMIB(x []identifier.MIB, mib identifier.MIB) int {
  105. i := sort.Search(len(x), func(i int) bool { return x[i] >= mib })
  106. if i < len(x) && x[i] == mib {
  107. return i
  108. }
  109. return -1
  110. }
  // maxMIMENameLen is the largest value the first byte of a packed name entry
  // may have to be treated as a length prefix rather than as the first
  // character of a name. It is one less than the character '0'. Officially
  // the maximum MIME name length is 40, but we leave some buffer.
  const maxMIMENameLen = '0' - 1
  112. func mimeName(x int) string {
  113. n := ianaNames[x]
  114. // See gen.go for a description of the encoding.
  115. if n[0] <= maxMIMENameLen {
  116. return n[1:n[0]]
  117. }
  118. return n
  119. }
  120. func ianaName(x int) string {
  121. n := ianaNames[x]
  122. // See gen.go for a description of the encoding.
  123. if n[0] <= maxMIMENameLen {
  124. return n[n[0]:]
  125. }
  126. return n
  127. }
  128. func mibName(x int) string {
  129. return mibNames[x]
  130. }
  131. var encodings = [numIANA]encoding.Encoding{
  132. enc106: unicode.UTF8,
  133. enc1015: unicode.UTF16(unicode.BigEndian, unicode.UseBOM),
  134. enc1013: unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM),
  135. enc1014: unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM),
  136. enc2028: charmap.CodePage037,
  137. enc2011: charmap.CodePage437,
  138. enc2009: charmap.CodePage850,
  139. enc2010: charmap.CodePage852,
  140. enc2046: charmap.CodePage855,
  141. enc2089: charmap.CodePage858,
  142. enc2048: charmap.CodePage860,
  143. enc2013: charmap.CodePage862,
  144. enc2050: charmap.CodePage863,
  145. enc2052: charmap.CodePage865,
  146. enc2086: charmap.CodePage866,
  147. enc2102: charmap.CodePage1047,
  148. enc2091: charmap.CodePage1140,
  149. enc4: charmap.ISO8859_1,
  150. enc5: charmap.ISO8859_2,
  151. enc6: charmap.ISO8859_3,
  152. enc7: charmap.ISO8859_4,
  153. enc8: charmap.ISO8859_5,
  154. enc9: charmap.ISO8859_6,
  155. enc81: charmap.ISO8859_6E,
  156. enc82: charmap.ISO8859_6I,
  157. enc10: charmap.ISO8859_7,
  158. enc11: charmap.ISO8859_8,
  159. enc84: charmap.ISO8859_8E,
  160. enc85: charmap.ISO8859_8I,
  161. enc12: charmap.ISO8859_9,
  162. enc13: charmap.ISO8859_10,
  163. enc109: charmap.ISO8859_13,
  164. enc110: charmap.ISO8859_14,
  165. enc111: charmap.ISO8859_15,
  166. enc112: charmap.ISO8859_16,
  167. enc2084: charmap.KOI8R,
  168. enc2088: charmap.KOI8U,
  169. enc2027: charmap.Macintosh,
  170. enc2109: charmap.Windows874,
  171. enc2250: charmap.Windows1250,
  172. enc2251: charmap.Windows1251,
  173. enc2252: charmap.Windows1252,
  174. enc2253: charmap.Windows1253,
  175. enc2254: charmap.Windows1254,
  176. enc2255: charmap.Windows1255,
  177. enc2256: charmap.Windows1256,
  178. enc2257: charmap.Windows1257,
  179. enc2258: charmap.Windows1258,
  180. enc18: japanese.EUCJP,
  181. enc39: japanese.ISO2022JP,
  182. enc17: japanese.ShiftJIS,
  183. enc38: korean.EUCKR,
  184. enc114: simplifiedchinese.GB18030,
  185. enc113: simplifiedchinese.GBK,
  186. enc2085: simplifiedchinese.HZGB2312,
  187. enc2026: traditionalchinese.Big5,
  188. }