You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 

144 lines
4.7 KiB

  1. // Copyright 2015 The Go Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style
  3. // license that can be found in the LICENSE file.
  4. package simplifiedchinese
  5. import (
  6. "strings"
  7. "testing"
  8. "golang.org/x/text/encoding"
  9. "golang.org/x/text/encoding/internal"
  10. "golang.org/x/text/encoding/internal/enctest"
  11. "golang.org/x/text/transform"
  12. )
  13. func dec(e encoding.Encoding) (dir string, t transform.Transformer, err error) {
  14. return "Decode", e.NewDecoder(), nil
  15. }
  16. func enc(e encoding.Encoding) (dir string, t transform.Transformer, err error) {
  17. return "Encode", e.NewEncoder(), internal.ErrASCIIReplacement
  18. }
  19. func TestNonRepertoire(t *testing.T) {
  20. // Pick n large enough to overflow the destination buffer of transform.String.
  21. const n = 10000
  22. testCases := []struct {
  23. init func(e encoding.Encoding) (string, transform.Transformer, error)
  24. e encoding.Encoding
  25. src, want string
  26. }{
  27. {dec, GBK, "a\xfe\xfeb", "a\ufffdb"},
  28. {dec, HZGB2312, "~{z~", "\ufffd"},
  29. {enc, GBK, "갂", ""},
  30. {enc, GBK, "a갂", "a"},
  31. {enc, GBK, "\u4e02갂", "\x81@"},
  32. {enc, HZGB2312, "갂", ""},
  33. {enc, HZGB2312, "a갂", "a"},
  34. {enc, HZGB2312, "\u6cf5갂", "~{1C~}"},
  35. {dec, GB18030, "\x80", "€"},
  36. {dec, GB18030, "\x81", "\ufffd"},
  37. {dec, GB18030, "\x81\x20", "\ufffd "},
  38. {dec, GB18030, "\xfe\xfe", "\ufffd"},
  39. {dec, GB18030, "\xfe\xff", "\ufffd\ufffd"},
  40. {dec, GB18030, "\xfe\x30", "\ufffd0"},
  41. {dec, GB18030, "\xfe\x30\x30 ", "\ufffd00 "},
  42. {dec, GB18030, "\xfe\x30\xff ", "\ufffd0\ufffd "},
  43. {dec, GB18030, "\xfe\x30\x81\x21", "\ufffd0\ufffd!"},
  44. {dec, GB18030, strings.Repeat("\xfe\x30", n), strings.Repeat("\ufffd0", n)},
  45. {dec, HZGB2312, "~/", "\ufffd"},
  46. {dec, HZGB2312, "~{a\x80", "\ufffd"},
  47. {dec, HZGB2312, "~{a\x80", "\ufffd"},
  48. {dec, HZGB2312, "~{" + strings.Repeat("z~", n), strings.Repeat("\ufffd", n)},
  49. {dec, HZGB2312, "~{" + strings.Repeat("\xfe\x30", n), strings.Repeat("\ufffd", n*2)},
  50. }
  51. for _, tc := range testCases {
  52. dir, tr, wantErr := tc.init(tc.e)
  53. dst, _, err := transform.String(tr, tc.src)
  54. if err != wantErr {
  55. t.Errorf("%s %v(%q): got %v; want %v", dir, tc.e, tc.src, err, wantErr)
  56. }
  57. if got := string(dst); got != tc.want {
  58. t.Errorf("%s %v(%q):\ngot %q\nwant %q", dir, tc.e, tc.src, got, tc.want)
  59. }
  60. }
  61. }
  62. func TestBasics(t *testing.T) {
  63. // The encoded forms can be verified by the iconv program:
  64. // $ echo 月日は百代 | iconv -f UTF-8 -t SHIFT-JIS | xxd
  65. testCases := []struct {
  66. e encoding.Encoding
  67. encPrefix string
  68. encoded string
  69. utf8 string
  70. }{{
  71. // "\u0081\u00de\u00df\u00e0\u00e1\u00e2\u00e3\uffff\U00010000" is a
  72. // nonsense string that contains GB18030 encodable codepoints of which
  73. // only U+00E0 and U+00E1 are GBK encodable.
  74. //
  75. // "A\u3000\u554a\u4e02\u4e90\u72dc\u7349\u02ca\u2588Z€" is a nonsense
  76. // string that contains ASCII and GBK encodable codepoints from Levels
  77. // 1-5 as well as the Euro sign.
  78. //
  79. // "A\u43f0\u4c32\U00027267\u3000\U0002910d\u79d4Z€" is a nonsense string
  80. // that contains ASCII and Big5 encodable codepoints from the Basic
  81. // Multilingual Plane and the Supplementary Ideographic Plane as well as
  82. // the Euro sign.
  83. //
  84. // "花间一壶酒,独酌无相亲。" (simplified) and
  85. // "花間一壺酒,獨酌無相親。" (traditional)
  86. // are from the 8th century poem "Yuè Xià Dú Zhuó".
  87. e: GB18030,
  88. encoded: "\x81\x30\x81\x31\x81\x30\x89\x37\x81\x30\x89\x38\xa8\xa4\xa8\xa2" +
  89. "\x81\x30\x89\x39\x81\x30\x8a\x30\x84\x31\xa4\x39\x90\x30\x81\x30",
  90. utf8: "\u0081\u00de\u00df\u00e0\u00e1\u00e2\u00e3\uffff\U00010000",
  91. }, {
  92. e: GB18030,
  93. encoded: "\xbb\xa8\xbc\xe4\xd2\xbb\xba\xf8\xbe\xc6\xa3\xac\xb6\xc0\xd7\xc3" +
  94. "\xce\xde\xcf\xe0\xc7\xd7\xa1\xa3",
  95. utf8: "花间一壶酒,独酌无相亲。",
  96. }, {
  97. e: GBK,
  98. encoded: "A\xa1\xa1\xb0\xa1\x81\x40\x81\x80\xaa\x40\xaa\x80\xa8\x40\xa8\x80Z\x80",
  99. utf8: "A\u3000\u554a\u4e02\u4e90\u72dc\u7349\u02ca\u2588Z€",
  100. }, {
  101. e: GBK,
  102. encoded: "\xbb\xa8\xbc\xe4\xd2\xbb\xba\xf8\xbe\xc6\xa3\xac\xb6\xc0\xd7\xc3" +
  103. "\xce\xde\xcf\xe0\xc7\xd7\xa1\xa3",
  104. utf8: "花间一壶酒,独酌无相亲。",
  105. }, {
  106. e: HZGB2312,
  107. encoded: "A~{\x21\x21~~\x30\x21~}Z~~",
  108. utf8: "A\u3000~\u554aZ~",
  109. }, {
  110. e: HZGB2312,
  111. encPrefix: "~{",
  112. encoded: ";(<dR;:x>F#,6@WCN^O`GW!#",
  113. utf8: "花间一壶酒,独酌无相亲。",
  114. }}
  115. for _, tc := range testCases {
  116. enctest.TestEncoding(t, tc.e, tc.encoded, tc.utf8, tc.encPrefix, "")
  117. }
  118. }
  119. func TestFiles(t *testing.T) {
  120. enctest.TestFile(t, GB18030)
  121. enctest.TestFile(t, GBK)
  122. enctest.TestFile(t, HZGB2312)
  123. }
  124. func BenchmarkEncoding(b *testing.B) {
  125. enctest.Benchmark(b, GB18030)
  126. enctest.Benchmark(b, GBK)
  127. enctest.Benchmark(b, HZGB2312)
  128. }