Browse Source

Merge pull request #773 from phsmit/golang_x_text_encoding

Golang x text encoding
无闻 11 years ago
parent
commit
e193005c66

+ 12 - 6
models/git_diff.go

@@ -14,12 +14,14 @@ import (
 	"strings"
 	"time"
 
+	"golang.org/x/net/html/charset"
+	"golang.org/x/text/transform"
+
 	"github.com/Unknwon/com"
 
 	"github.com/gogits/gogs/modules/base"
 	"github.com/gogits/gogs/modules/git"
 	"github.com/gogits/gogs/modules/log"
-	"github.com/gogits/gogs/modules/mahonia"
 	"github.com/gogits/gogs/modules/process"
 )
 
@@ -192,14 +194,18 @@ func ParsePatch(pid int64, maxlines int, cmd *exec.Cmd, reader io.Reader) (*Diff
 	}
 
 	// FIXME: use first 30 lines to detect file encoding.
-	charset, err := base.DetectEncoding(buf.Bytes())
-	if charset != "utf8" && err == nil {
-		decoder := mahonia.NewDecoder(charset)
-		if decoder != nil {
+	charsetLabel, err := base.DetectEncoding(buf.Bytes())
+	if charsetLabel != "utf8" && err == nil {
+		encoding, _ := charset.Lookup(charsetLabel)
+
+		if encoding != nil {
+			d := encoding.NewDecoder()
 			for _, f := range diff.Files {
 				for _, sec := range f.Sections {
 					for _, l := range sec.Lines {
-						l.Content = decoder.ConvertString(l.Content)
+						if c, _, err := transform.String(d, l.Content); err == nil {
+							l.Content = c
+						}
 					}
 				}
 			}

+ 19 - 8
modules/base/template.go

@@ -7,14 +7,15 @@ package base
 import (
 	"container/list"
 	"encoding/json"
-	"errors"
 	"fmt"
 	"html/template"
 	"runtime"
 	"strings"
 	"time"
 
-	"github.com/gogits/gogs/modules/mahonia"
+	"golang.org/x/net/html/charset"
+	"golang.org/x/text/transform"
+
 	"github.com/gogits/gogs/modules/setting"
 	"github.com/saintfish/chardet"
 )
@@ -54,20 +55,30 @@ func DetectEncoding(content []byte) (string, error) {
 }
 
 func ToUtf8WithErr(content []byte) (error, string) {
-	charset, err := DetectEncoding(content)
+	charsetLabel, err := DetectEncoding(content)
 	if err != nil {
 		return err, ""
 	}
 
-	if charset == "utf8" {
+	if charsetLabel == "utf8" {
 		return nil, string(content)
 	}
 
-	decoder := mahonia.NewDecoder(charset)
-	if decoder != nil {
-		return nil, decoder.ConvertString(string(content))
+	encoding, _ := charset.Lookup(charsetLabel)
+
+	if encoding == nil {
+		return fmt.Errorf("unknow char decoder %s", charsetLabel), string(content)
 	}
-	return errors.New("unknow char decoder"), string(content)
+
+	result, n, err := transform.String(encoding.NewDecoder(), string(content))
+
+	// If there is an error, we concatenate the nicely decoded part and the
+	// original left over. This way we won't lose data.
+	if err != nil {
+		result = result + string(content[n:])
+	}
+
+	return err, result
 }
 
 func ToUtf8(content string) string {

File diff suppressed because it is too large
+ 0 - 844
modules/mahonia/8bit.go


+ 0 - 76
modules/mahonia/ASCII.go

@@ -1,76 +0,0 @@
-package mahonia
-
-// Converters for ASCII and ISO-8859-1
-
-func init() {
-	for i := 0; i < len(asciiCharsets); i++ {
-		RegisterCharset(&asciiCharsets[i])
-	}
-}
-
-var asciiCharsets = []Charset{
-	{
-		Name:       "US-ASCII",
-		NewDecoder: func() Decoder { return decodeASCIIRune },
-		NewEncoder: func() Encoder { return encodeASCIIRune },
-		Aliases:    []string{"ASCII", "US", "ISO646-US", "IBM367", "cp367", "ANSI_X3.4-1968", "iso-ir-6", "ANSI_X3.4-1986", "ISO_646.irv:1991", "csASCII"},
-	},
-	{
-		Name:       "ISO-8859-1",
-		NewDecoder: func() Decoder { return decodeLatin1Rune },
-		NewEncoder: func() Encoder { return encodeLatin1Rune },
-		Aliases:    []string{"latin1", "ISO Latin 1", "IBM819", "cp819", "ISO_8859-1:1987", "iso-ir-100", "l1", "csISOLatin1"},
-	},
-}
-
-func decodeASCIIRune(p []byte) (c rune, size int, status Status) {
-	if len(p) == 0 {
-		status = NO_ROOM
-		return
-	}
-
-	b := p[0]
-	if b > 127 {
-		return 0xfffd, 1, INVALID_CHAR
-	}
-	return rune(b), 1, SUCCESS
-}
-
-func encodeASCIIRune(p []byte, c rune) (size int, status Status) {
-	if len(p) == 0 {
-		status = NO_ROOM
-		return
-	}
-
-	if c < 128 {
-		p[0] = byte(c)
-		return 1, SUCCESS
-	}
-
-	p[0] = '?'
-	return 1, INVALID_CHAR
-}
-
-func decodeLatin1Rune(p []byte) (c rune, size int, status Status) {
-	if len(p) == 0 {
-		status = NO_ROOM
-		return
-	}
-
-	return rune(p[0]), 1, SUCCESS
-}
-
-func encodeLatin1Rune(p []byte, c rune) (size int, status Status) {
-	if len(p) == 0 {
-		status = NO_ROOM
-		return
-	}
-
-	if c < 256 {
-		p[0] = byte(c)
-		return 1, SUCCESS
-	}
-
-	p[0] = '?'
-	return 1, INVALID_CHAR
-}

File diff suppressed because it is too large
+ 0 - 13707
modules/mahonia/big5-data.go


+ 0 - 89
modules/mahonia/big5.go

@@ -1,89 +0,0 @@
-package mahonia
-
-// Converters for Big 5 encoding.
-
-import (
-	"sync"
-)
-
-func init() {
-	RegisterCharset(&Charset{
-		Name:    "Big5",
-		Aliases: []string{"csBig5"},
-		NewDecoder: func() Decoder {
-			return decodeBig5Rune
-		},
-		NewEncoder: func() Encoder {
-			big5Once.Do(reverseBig5Table)
-			return encodeBig5Rune
-		},
-	})
-}
-
-func decodeBig5Rune(p []byte) (r rune, size int, status Status) {
-	if len(p) == 0 {
-		status = NO_ROOM
-		return
-	}
-
-	b := p[0]
-	if b < 128 {
-		return rune(b), 1, SUCCESS
-	}
-
-	if len(p) < 2 {
-		status = NO_ROOM
-		return
-	}
-
-	c := int(p[0])<<8 + int(p[1])
-	c = int(big5ToUnicode[c])
-	if c > 0 {
-		return rune(c), 2, SUCCESS
-	}
-
-	return 0xfffd, 1, INVALID_CHAR
-}
-
-func encodeBig5Rune(p []byte, r rune) (size int, status Status) {
-	if len(p) == 0 {
-		status = NO_ROOM
-		return
-	}
-
-	if r < 128 {
-		p[0] = byte(r)
-		return 1, SUCCESS
-	}
-
-	if len(p) < 2 {
-		status = NO_ROOM
-		return
-	}
-
-	if r < 0x10000 {
-		c := unicodeToBig5[r]
-		if c > 0 {
-			p[0] = byte(c >> 8)
-			p[1] = byte(c)
-			return 2, SUCCESS
-		}
-	}
-
-	p[0] = '?'
-	return 1, INVALID_CHAR
-}
-
-var big5Once sync.Once
-
-var unicodeToBig5 []uint16
-
-func reverseBig5Table() {
-	unicodeToBig5 = make([]uint16, 65536)
-
-	for big5, unicode := range big5ToUnicode {
-		if unicode > 0 {
-			unicodeToBig5[unicode] = uint16(big5)
-		}
-	}
-}

+ 0 - 115
modules/mahonia/charset.go

@@ -1,115 +0,0 @@
-// This package is a character-set conversion library for Go.
-//
-// (DEPRECATED: use code.google.com/p/go.text/encoding, perhaps along with
-// code.google.com/p/go.net/html/charset)
-package mahonia
-
-import (
-	"bytes"
-	"unicode"
-)
-
-// Status is the type for the status return value from a Decoder or Encoder.
-type Status int
-
-const (
-	// SUCCESS means that the character was converted with no problems.
-	SUCCESS = Status(iota)
-
-	// INVALID_CHAR means that the source contained invalid bytes, or that the character
-	// could not be represented in the destination encoding.
-	// The Encoder or Decoder should have output a substitute character.
-	INVALID_CHAR
-
-	// NO_ROOM means there were not enough input bytes to form a complete character,
-	// or there was not enough room in the output buffer to write a complete character.
-	// No bytes were written, and no internal state was changed in the Encoder or Decoder.
-	NO_ROOM
-
-	// STATE_ONLY means that bytes were read or written indicating a state transition,
-	// but no actual character was processed. (Examples: byte order marks, ISO-2022 escape sequences)
-	STATE_ONLY
-)
-
-// A Decoder is a function that decodes a character set, one character at a time.
-// It works much like utf8.DecodeRune, but has an additional status return value.
-type Decoder func(p []byte) (c rune, size int, status Status)
-
-// An Encoder is a function that encodes a character set, one character at a time.
-// It works much like utf8.EncodeRune, but has an additional status return value.
-type Encoder func(p []byte, c rune) (size int, status Status)
-
-// A Charset represents a character set that can be converted, and contains functions
-// to create Converters to encode and decode strings in that character set.
-type Charset struct {
-	// Name is the character set's canonical name.
-	Name string
-
-	// Aliases is a list of alternate names.
-	Aliases []string
-
-	// NewDecoder returns a Decoder to convert from the charset to Unicode.
-	NewDecoder func() Decoder
-
-	// NewEncoder returns an Encoder to convert from Unicode to the charset.
-	NewEncoder func() Encoder
-}
-
-// The charsets are stored in charsets under their canonical names.
-var charsets = make(map[string]*Charset)
-
-// aliases maps their aliases to their canonical names.
-var aliases = make(map[string]string)
-
-// simplifyName converts a name to lower case and removes non-alphanumeric characters.
-// This is how the names are used as keys to the maps.
-func simplifyName(name string) string {
-	var buf bytes.Buffer
-	for _, c := range name {
-		switch {
-		case unicode.IsDigit(c):
-			buf.WriteRune(c)
-		case unicode.IsLetter(c):
-			buf.WriteRune(unicode.ToLower(c))
-		default:
-
-		}
-	}
-
-	return buf.String()
-}
-
-// RegisterCharset adds a charset to the charsets map and records its name and aliases in the aliases map.
-func RegisterCharset(cs *Charset) {
-	name := cs.Name
-	charsets[name] = cs
-	aliases[simplifyName(name)] = name
-	for _, alias := range cs.Aliases {
-		aliases[simplifyName(alias)] = name
-	}
-}
-
-// GetCharset fetches a charset by name.
-// If the name is not found, it returns nil.
-func GetCharset(name string) *Charset {
-	return charsets[aliases[simplifyName(name)]]
-}
-
-// NewDecoder returns a Decoder to decode the named charset.
-// If the name is not found, it returns nil.
-func NewDecoder(name string) Decoder {
-	cs := GetCharset(name)
-	if cs == nil {
-		return nil
-	}
-	return cs.NewDecoder()
-}
-
-// NewEncoder returns an Encoder to encode the named charset.
-func NewEncoder(name string) Encoder {
-	cs := GetCharset(name)
-	if cs == nil {
-		return nil
-	}
-	return cs.NewEncoder()
-}

+ 0 - 135
modules/mahonia/convert_string.go

@@ -1,135 +0,0 @@
-package mahonia
-
-import (
-	"unicode/utf8"
-)
-
-// ConvertString converts a string from UTF-8 to e's encoding.
-func (e Encoder) ConvertString(s string) string {
-	dest := make([]byte, len(s)+10)
-	destPos := 0
-
-	for _, rune := range s {
-	retry:
-		size, status := e(dest[destPos:], rune)
-
-		if status == NO_ROOM {
-			newDest := make([]byte, len(dest)*2)
-			copy(newDest, dest)
-			dest = newDest
-			goto retry
-		}
-
-		if status == STATE_ONLY {
-			destPos += size
-			goto retry
-		}
-
-		destPos += size
-	}
-
-	return string(dest[:destPos])
-}
-
-// ConvertString converts a string from d's encoding to UTF-8.
-func (d Decoder) ConvertString(s string) string {
-	bytes := []byte(s)
-	runes := make([]rune, len(s))
-	destPos := 0
-
-	for len(bytes) > 0 {
-		c, size, status := d(bytes)
-
-		if status == STATE_ONLY {
-			bytes = bytes[size:]
-			continue
-		}
-
-		if status == NO_ROOM {
-			c = 0xfffd
-			size = len(bytes)
-			status = INVALID_CHAR
-		}
-
-		bytes = bytes[size:]
-		runes[destPos] = c
-		destPos++
-	}
-
-	return string(runes[:destPos])
-}
-
-// ConvertStringOK converts a string from UTF-8 to e's encoding. It also
-// returns a boolean indicating whether every character was converted
-// successfully.
-func (e Encoder) ConvertStringOK(s string) (result string, ok bool) {
-	dest := make([]byte, len(s)+10)
-	destPos := 0
-	ok = true
-
-	for i, r := range s {
-		// The following test is copied from utf8.ValidString.
-		if r == utf8.RuneError && ok {
-			_, size := utf8.DecodeRuneInString(s[i:])
-			if size == 1 {
-				ok = false
-			}
-		}
-
-	retry:
-		size, status := e(dest[destPos:], r)
-
-		switch status {
-		case NO_ROOM:
-			newDest := make([]byte, len(dest)*2)
-			copy(newDest, dest)
-			dest = newDest
-			goto retry
-
-		case STATE_ONLY:
-			destPos += size
-			goto retry
-
-		case INVALID_CHAR:
-			ok = false
-		}
-
-		destPos += size
-	}
-
-	return string(dest[:destPos]), ok
-}
-
-// ConvertStringOK converts a string from d's encoding to UTF-8.
-// It also returns a boolean indicating whether every character was converted
-// successfully.
-func (d Decoder) ConvertStringOK(s string) (result string, ok bool) {
-	bytes := []byte(s)
-	runes := make([]rune, len(s))
-	destPos := 0
-	ok = true
-
-	for len(bytes) > 0 {
-		c, size, status := d(bytes)
-
-		switch status {
-		case STATE_ONLY:
-			bytes = bytes[size:]
-			continue
-
-		case NO_ROOM:
-			c = 0xfffd
-			size = len(bytes)
-			ok = false
-
-		case INVALID_CHAR:
-			ok = false
-		}
-
-		bytes = bytes[size:]
-		runes[destPos] = c
-		destPos++
-	}
-
-	return string(runes[:destPos]), ok
-}

+ 0 - 76
modules/mahonia/cp51932.go

@@ -1,76 +0,0 @@
-package mahonia
-
-import (
-	"unicode/utf8"
-)
-
-// Converters for Microsoft's version of the EUC-JP encoding
-
-func init() {
-	RegisterCharset(&Charset{
-		Name:    "cp51932",
-		Aliases: []string{"windows-51932"},
-		NewDecoder: func() Decoder {
-			return decodeCP51932
-		},
-		NewEncoder: func() Encoder {
-			msJISTable.Reverse()
-			return encodeCP51932
-		},
-	})
-}
-
-func decodeCP51932(p []byte) (c rune, size int, status Status) {
-	if len(p) == 0 {
-		return 0, 0, NO_ROOM
-	}
-
-	b := p[0]
-	switch {
-	case b < 0x80:
-		return rune(b), 1, SUCCESS
-
-	case b == 0x8e:
-		if len(p) < 2 {
-			return 0, 0, NO_ROOM
-		}
-		b2 := p[1]
-		if b2 < 0xa1 || b2 > 0xdf {
-			return utf8.RuneError, 1, INVALID_CHAR
-		}
-		return rune(b2) + (0xff61 - 0xa1), 2, SUCCESS
-
-	case 0xa1 <= b && b <= 0xfe:
-		return msJISTable.DecodeHigh(p)
-	}
-
-	return utf8.RuneError, 1, INVALID_CHAR
-}
-
-func encodeCP51932(p []byte, c rune) (size int, status Status) {
-	if len(p) == 0 {
-		return 0, NO_ROOM
-	}
-
-	if c < 0x80 {
-		p[0] = byte(c)
-		return 1, SUCCESS
-	}
-
-	if len(p) < 2 {
-		return 0, NO_ROOM
-	}
-
-	if c > 0xffff {
-		p[0] = '?'
-		return 1, INVALID_CHAR
-	}
-
-	if 0xff61 <= c && c <= 0xff9f {
-		p[0] = 0x8e
-		p[1] = byte(c - (0xff61 - 0xa1))
-		return 2, SUCCESS
-	}
-
-	return msJISTable.EncodeHigh(p, c)
-}

+ 0 - 179
modules/mahonia/entity.go

@@ -1,179 +0,0 @@
-package mahonia
-
-// decoding HTML entities
-
-import (
-	"sort"
-)
-
-// EntityDecoder returns a Decoder that decodes HTML character entities.
-// If there is no valid character entity at the current position, it returns INVALID_CHAR.
-// So it needs to be combined with another Decoder via FallbackDecoder.
-func EntityDecoder() Decoder {
-	var leftover rune // leftover rune from two-rune entity
-	return func(p []byte) (r rune, size int, status Status) {
-		if leftover != 0 {
-			r = leftover
-			leftover = 0
-			return r, 0, SUCCESS
-		}
-
-		if len(p) == 0 {
-			return 0, 0, NO_ROOM
-		}
-
-		if p[0] != '&' {
-			return 0xfffd, 1, INVALID_CHAR
-		}
-
-		if len(p) < 3 {
-			return 0, 1, NO_ROOM
-		}
-
-		r, size, status = 0xfffd, 1, INVALID_CHAR
-		n := 1 // number of bytes read so far
-
-		if p[n] == '#' {
-			n++
-			c := p[n]
-			hex := false
-			if c == 'x' || c == 'X' {
-				hex = true
-				n++
-			}
-
-			var x rune
-			for n < len(p) {
-				c = p[n]
-				n++
-				if hex {
-					if '0' <= c && c <= '9' {
-						x = 16*x + rune(c) - '0'
-						continue
-					} else if 'a' <= c && c <= 'f' {
-						x = 16*x + rune(c) - 'a' + 10
-						continue
-					} else if 'A' <= c && c <= 'F' {
-						x = 16*x + rune(c) - 'A' + 10
-						continue
-					}
-				} else if '0' <= c && c <= '9' {
-					x = 10*x + rune(c) - '0'
-					continue
-				}
-				if c != ';' {
-					n--
-				}
-				break
-			}
-
-			if n == len(p) && p[n-1] != ';' {
-				return 0, 0, NO_ROOM
-			}
-
-			size = n
-			if p[n-1] == ';' {
-				n--
-			}
-			if hex {
-				n--
-			}
-			n--
-			// Now n is the number of actual digits read.
-			if n == 0 {
-				return 0xfffd, 1, INVALID_CHAR
-			}
-
-			if 0x80 <= x && x <= 0x9F {
-				// Replace characters from Windows-1252 with UTF-8 equivalents.
-				x = replacementTable[x-0x80]
-			} else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
-				// Replace invalid characters with the replacement character.
-				return 0xfffd, size, INVALID_CHAR
-			}
-
-			r = x
-			status = SUCCESS
-			return
-		}
-
-		// Look for a named entity in EntityList.
-
-		possible := entityList
-		for len(possible) > 0 {
-			if len(p) <= n {
-				leftover = 0
-				return 0, 0, NO_ROOM
-			}
-
-			c := p[n]
-
-			// Narrow down the selection in possible to those items that have c in the
-			// appropriate byte.
-			first := sort.Search(len(possible), func(i int) bool {
-				e := possible[i].name
-				if len(e) < n {
-					return false
-				}
-				return e[n-1] >= c
-			})
-			possible = possible[first:]
-			last := sort.Search(len(possible), func(i int) bool {
-				return possible[i].name[n-1] > c
-			})
-			possible = possible[:last]
-
-			n++
-			if len(possible) > 0 && len(possible[0].name) == n-1 {
-				r, leftover = possible[0].r1, possible[0].r2
-				size = n
-				status = SUCCESS
-				// but don't return yet, since we need the longest match
-			}
-		}
-
-		return
-	}
-}
-
-// This table is copied from /src/pkg/html/escape.go in the Go source
-//
-// These replacements permit compatibility with old numeric entities that
-// assumed Windows-1252 encoding.
-// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
-var replacementTable = [...]rune{
-	'\u20AC', // First entry is what 0x80 should be replaced with.
-	'\u0081',
-	'\u201A',
-	'\u0192',
-	'\u201E',
-	'\u2026',
-	'\u2020',
-	'\u2021',
-	'\u02C6',
-	'\u2030',
-	'\u0160',
-	'\u2039',
-	'\u0152',
-	'\u008D',
-	'\u017D',
-	'\u008F',
-	'\u0090',
-	'\u2018',
-	'\u2019',
-	'\u201C',
-	'\u201D',
-	'\u2022',
-	'\u2013',
-	'\u2014',
-	'\u02DC',
-	'\u2122',
-	'\u0161',
-	'\u203A',
-	'\u0153',
-	'\u009D',
-	'\u017E',
-	'\u0178', // Last entry is 0x9F.
-	// 0x00->'\uFFFD' is handled programmatically.
-	// 0x0D->'\u000D' is a no-op.
-}

File diff suppressed because it is too large
+ 0 - 2254
modules/mahonia/entity_data.go


+ 0 - 102
modules/mahonia/euc-jp.go

@@ -1,102 +0,0 @@
-package mahonia
-
-import (
-	"unicode/utf8"
-)
-
-// Converters for the EUC-JP encoding
-
-func init() {
-	RegisterCharset(&Charset{
-		Name:    "EUC-JP",
-		Aliases: []string{"extended_unix_code_packed_format_for_japanese", "cseucpkdfmtjapanese"},
-		NewDecoder: func() Decoder {
-			return decodeEucJP
-		},
-		NewEncoder: func() Encoder {
-			jis0208Table.Reverse()
-			jis0212Table.Reverse()
-			return encodeEucJP
-		},
-	})
-}
-
-func decodeEucJP(p []byte) (c rune, size int, status Status) {
-	if len(p) == 0 {
-		return 0, 0, NO_ROOM
-	}
-
-	b := p[0]
-	switch {
-	case b < 0x80:
-		return rune(b), 1, SUCCESS
-
-	case b == 0x8e:
-		if len(p) < 2 {
-			return 0, 0, NO_ROOM
-		}
-		b2 := p[1]
-		if b2 < 0xa1 || b2 > 0xdf {
-			return utf8.RuneError, 1, INVALID_CHAR
-		}
-		return rune(b2) + (0xff61 - 0xa1), 2, SUCCESS
-
-	case b == 0x8f:
-		if len(p) < 3 {
-			return 0, 0, NO_ROOM
-		}
-		c, size, status = jis0212Table.DecodeHigh(p[1:3])
-		if status == SUCCESS {
-			size = 3
-		}
-		return
-
-	case 0xa1 <= b && b <= 0xfe:
-		return jis0208Table.DecodeHigh(p)
-	}
-
-	return utf8.RuneError, 1, INVALID_CHAR
-}
-
-func encodeEucJP(p []byte, c rune) (size int, status Status) {
-	if len(p) == 0 {
-		return 0, NO_ROOM
-	}
-
-	if c < 0x80 {
-		p[0] = byte(c)
-		return 1, SUCCESS
-	}
-
-	if len(p) < 2 {
-		return 0, NO_ROOM
-	}
-
-	if c > 0xffff {
-		p[0] = '?'
-		return 1, INVALID_CHAR
-	}
-
-	if 0xff61 <= c && c <= 0xff9f {
-		p[0] = 0x8e
-		p[1] = byte(c - (0xff61 - 0xa1))
-		return 2, SUCCESS
-	}
-
-	size, status = jis0208Table.EncodeHigh(p, c)
-	if status == SUCCESS {
-		return size, status
-	}
-
-	size, status = jis0212Table.EncodeHigh(p[1:], c)
-	switch status {
-	case SUCCESS:
-		p[0] = 0x8f
-		return size + 1, SUCCESS
-
-	case INVALID_CHAR:
-		p[0] = '?'
-		return 1, INVALID_CHAR
-	}
-	return size, status
-}

File diff suppressed because it is too large
+ 0 - 17072
modules/mahonia/euc-kr-data.go


+ 0 - 89
modules/mahonia/euc-kr.go

@@ -1,89 +0,0 @@
-package mahonia
-
-// Converters for the EUC-KR encoding.
-
-import (
-	"unicode/utf8"
-)
-
-func init() {
-	RegisterCharset(&Charset{
-		Name: "EUC-KR",
-		Aliases: []string{
-			"ibm-1363",
-			"KS_C_5601-1987",
-			"KS_C_5601-1989",
-			"KSC_5601",
-			"Korean",
-			"iso-ir-149",
-			"cp1363",
-			"5601",
-			"ksc",
-			"windows-949",
-			"ibm-970",
-			"cp970",
-			"970",
-			"cp949",
-		},
-		NewDecoder: func() Decoder {
-			return decodeEucKr
-		},
-		NewEncoder: func() Encoder {
-			eucKrOnce.Do(reverseEucKrTable)
-			return encodeEucKr
-		},
-	})
-}
-
-func decodeEucKr(p []byte) (c rune, size int, status Status) {
-	if len(p) == 0 {
-		return 0, 0, NO_ROOM
-	}
-
-	b := p[0]
-	if b < 0x80 {
-		return rune(b), 1, SUCCESS
-	}
-
-	if len(p) < 2 {
-		return 0, 0, NO_ROOM
-	}
-
-	euc := int(b)<<8 + int(p[1])
-	c = rune(eucKrToUnicode[euc])
-
-	if c == 0 {
-		return utf8.RuneError, 2, INVALID_CHAR
-	}
-	return c, 2, SUCCESS
-}
-
-func encodeEucKr(p []byte, c rune) (size int, status Status) {
-	if len(p) == 0 {
-		return 0, NO_ROOM
-	}
-
-	if c < 0x80 {
-		p[0] = byte(c)
-		return 1, SUCCESS
-	}
-
-	if len(p) < 2 {
-		return 0, NO_ROOM
-	}
-
-	if c > 0xffff {
-		p[0] = '?'
-		return 1, INVALID_CHAR
-	}
-
-	euc := unicodeToEucKr[c]
-	if euc == 0 {
-		p[0] = '?'
-		return 1, INVALID_CHAR
-	}
-
-	p[0] = byte(euc >> 8)
-	p[1] = byte(euc)
-	return 2, SUCCESS
-}

+ 0 - 19
modules/mahonia/fallback.go

@@ -1,19 +0,0 @@
-package mahonia
-
-// FallbackDecoder combines a series of Decoders into one.
-// If the first Decoder returns a status of INVALID_CHAR, the others are tried as well.
-//
-// Note: if the text to be decoded ends with a sequence of bytes that is not a valid character in the first charset,
-// but it could be the beginning of a valid character, the FallbackDecoder will give a status of NO_ROOM instead of
-// falling back to the other Decoders.
-func FallbackDecoder(decoders ...Decoder) Decoder {
-	return func(p []byte) (c rune, size int, status Status) {
-		for _, d := range decoders {
-			c, size, status = d(p)
-			if status != INVALID_CHAR {
-				return
-			}
-		}
-		return 0, 1, INVALID_CHAR
-	}
-}

File diff suppressed because it is too large
+ 0 - 6839
modules/mahonia/gb18030-data.go


+ 0 - 156
modules/mahonia/gb18030.go

@@ -1,156 +0,0 @@
-package mahonia
-
-import (
-	"sync"
-)
-
-// Converters for GB18030 encoding.
-
-func init() {
-	RegisterCharset(&Charset{
-		Name: "GB18030",
-		NewDecoder: func() Decoder {
-			gb18030Once.Do(buildGB18030Tables)
-			return decodeGB18030Rune
-		},
-		NewEncoder: func() Encoder {
-			gb18030Once.Do(buildGB18030Tables)
-			return encodeGB18030Rune
-		},
-	})
-}
-
-func decodeGB18030Rune(p []byte) (r rune, size int, status Status) {
-	if len(p) == 0 {
-		status = NO_ROOM
-		return
-	}
-
-	b := p[0]
-	if b < 128 {
-		return rune(b), 1, SUCCESS
-	}
-
-	if len(p) < 2 {
-		status = NO_ROOM
-		return
-	}
-
-	if p[0] < 0x81 || p[0] > 0xfe {
-		return 0xfffd, 1, INVALID_CHAR
-	}
-
-	if p[1] >= 0x40 {
-		// 2-byte character
-		c := uint16(p[0])<<8 + uint16(p[1])
-		r = rune(gbkToUnicode[c])
-		if r == 0 {
-			r = gbkToUnicodeExtra[c]
-		}
-
-		if r != 0 {
-			return r, 2, SUCCESS
-		}
-	} else if p[1] >= 0x30 {
-		// 4-byte character
-		if len(p) < 4 {
-			return 0, 0, NO_ROOM
-		}
-		if p[2] < 0x81 || p[2] > 0xfe || p[3] < 0x30 || p[3] > 0x39 {
-			return 0xfffd, 1, INVALID_CHAR
-		}
-
-		code := uint32(p[0])<<24 + uint32(p[1])<<16 + uint32(p[2])<<8 + uint32(p[3])
-		lin := gb18030Linear(code)
-
-		if lin <= maxGB18030Linear {
-			r = rune(gb18030LinearToUnicode[lin])
-			if r != 0 {
-				return r, 4, SUCCESS
-			}
-		}
-
-		for _, rng := range gb18030Ranges {
-			if lin >= rng.firstGB && lin <= rng.lastGB {
-				return rng.firstRune + rune(lin) - rune(rng.firstGB), 4, SUCCESS
-			}
-		}
-	}
-
-	return 0xfffd, 1, INVALID_CHAR
-}
-
-func encodeGB18030Rune(p []byte, r rune) (size int, status Status) {
-	if len(p) == 0 {
-		status = NO_ROOM
-		return
-	}
-
-	if r < 128 {
-		p[0] = byte(r)
-		return 1, SUCCESS
-	}
-
-	if len(p) < 2 {
-		status = NO_ROOM
-		return
-	}
-
-	var c uint16
-	if r < 0x10000 {
-		c = unicodeToGBK[r]
-	} else {
-		c = unicodeToGBKExtra[r]
-	}
-
-	if c != 0 {
-		p[0] = byte(c >> 8)
-		p[1] = byte(c)
-		return 2, SUCCESS
-	}
-
-	if len(p) < 4 {
-		return 0, NO_ROOM
-	}
-
-	if r < 0x10000 {
-		f := unicodeToGB18030[r]
-		if f != 0 {
-			p[0] = byte(f >> 24)
-			p[1] = byte(f >> 16)
-			p[2] = byte(f >> 8)
-			p[3] = byte(f)
-			return 4, SUCCESS
-		}
-	}
-
-	for _, rng := range gb18030Ranges {
-		if r >= rng.firstRune && r <= rng.lastRune {
-			lin := rng.firstGB + uint32(r) - uint32(rng.firstRune)
-			p[0] = byte(lin/(10*126*10)) + 0x81
-			p[1] = byte(lin/(126*10)%10) + 0x30
-			p[2] = byte(lin/10%126) + 0x81
-			p[3] = byte(lin%10) + 0x30
-			return 4, SUCCESS
-		}
-	}
-
-	p[0] = 0x1a
-	return 1, INVALID_CHAR
-}
-
-var gb18030Once sync.Once
-
-// Mapping from gb18030Linear values to Unicode.
-var gb18030LinearToUnicode []uint16
-
-var unicodeToGB18030 []uint32
-
-func buildGB18030Tables() {
-	gb18030LinearToUnicode = make([]uint16, maxGB18030Linear+1)
-	unicodeToGB18030 = make([]uint32, 65536)
-	for _, data := range gb18030Data {
-		gb18030LinearToUnicode[gb18030Linear(data.gb18030)] = data.unicode
-		unicodeToGB18030[data.unicode] = data.gb18030
-	}
-}

File diff suppressed because it is too large
+ 0 - 47922
modules/mahonia/gbk-data.go


+ 0 - 78
modules/mahonia/gbk.go

@@ -1,78 +0,0 @@
-package mahonia
-
-// Converters for GBK encoding.
-
-func init() {
-	RegisterCharset(&Charset{
-		Name:    "GBK",
-		Aliases: []string{"GB2312"}, // GBK is a superset of GB2312.
-		NewDecoder: func() Decoder {
-			return decodeGBKRune
-		},
-		NewEncoder: func() Encoder {
-			return encodeGBKRune
-		},
-	})
-}
-
-func decodeGBKRune(p []byte) (r rune, size int, status Status) {
-	if len(p) == 0 {
-		status = NO_ROOM
-		return
-	}
-
-	b := p[0]
-	if b < 128 {
-		return rune(b), 1, SUCCESS
-	}
-
-	if len(p) < 2 {
-		status = NO_ROOM
-		return
-	}
-
-	c := uint16(p[0])<<8 + uint16(p[1])
-	r = rune(gbkToUnicode[c])
-	if r == 0 {
-		r = gbkToUnicodeExtra[c]
-	}
-
-	if r != 0 {
-		return r, 2, SUCCESS
-	}
-
-	return 0xfffd, 1, INVALID_CHAR
-}
-
-func encodeGBKRune(p []byte, r rune) (size int, status Status) {
-	if len(p) == 0 {
-		status = NO_ROOM
-		return
-	}
-
-	if r < 128 {
-		p[0] = byte(r)
-		return 1, SUCCESS
-	}
-
-	if len(p) < 2 {
-		status = NO_ROOM
-		return
-	}
-
-	var c uint16
-	if r < 0x10000 {
-		c = unicodeToGBK[r]
-	} else {
-		c = unicodeToGBKExtra[r]
-	}
-
-	if c != 0 {
-		p[0] = byte(c >> 8)
-		p[1] = byte(c)
-		return 2, SUCCESS
-	}
-
-	p[0] = 0x1a
-	return 1, INVALID_CHAR
-}

+ 0 - 124
modules/mahonia/iso2022jp.go

@@ -1,124 +0,0 @@
-package mahonia
-
-import (
-	"unicode/utf8"
-)
-
-// converters for ISO-2022-JP encoding
-
-const esc = 27
-
-func init() {
-	type jpEncoding int
-	const (
-		ascii jpEncoding = iota
-		jisX0201Roman
-		jisX0208
-	)
-
-	RegisterCharset(&Charset{
-		Name: "ISO-2022-JP",
-		NewDecoder: func() Decoder {
-			encoding := ascii
-			return func(p []byte) (c rune, size int, status Status) {
-				if len(p) == 0 {
-					return 0, 0, NO_ROOM
-				}
-
-				b := p[0]
-				if b == esc {
-					if len(p) < 3 {
-						return 0, 0, NO_ROOM
-					}
-					switch p[1] {
-					case '(':
-						switch p[2] {
-						case 'B':
-							encoding = ascii
-							return 0, 3, STATE_ONLY
-
-						case 'J':
-							encoding = jisX0201Roman
-							return 0, 3, STATE_ONLY
-						}
-
-					case '$':
-						switch p[2] {
-						case '@', 'B':
-							encoding = jisX0208
-							return 0, 3, STATE_ONLY
-						}
-					}
-				}
-
-				switch encoding {
-				case ascii:
-					if b > 127 {
-						return utf8.RuneError, 1, INVALID_CHAR
-					}
-					return rune(b), 1, SUCCESS
-
-				case jisX0201Roman:
-					if b > 127 {
-						return utf8.RuneError, 1, INVALID_CHAR
-					}
-					switch b {
-					case '\\':
-						return 0xA5, 1, SUCCESS
-					case '~':
-						return 0x203E, 1, SUCCESS
-					}
-					return rune(b), 1, SUCCESS
-
-				case jisX0208:
-					return jis0208Table.DecodeLow(p)
-				}
-				panic("unreachable")
-			}
-		},
-		NewEncoder: func() Encoder {
-			jis0208Table.Reverse()
-			encoding := ascii
-			return func(p []byte, c rune) (size int, status Status) {
-				if len(p) == 0 {
-					return 0, NO_ROOM
-				}
-
-				if c < 128 {
-					if encoding != ascii {
-						if len(p) < 4 {
-							return 0, NO_ROOM
-						}
-						p[0], p[1], p[2] = esc, '(', 'B'
-						p[3] = byte(c)
-						encoding = ascii
-						return 4, SUCCESS
-					}
-					p[0] = byte(c)
-					return 1, SUCCESS
-				}
-
-				if c > 65535 {
-					return 0, INVALID_CHAR
-				}
-				jis := jis0208Table.FromUnicode[c]
-				if jis == [2]byte{0, 0} && c != rune(jis0208Table.Data[0][0]) {
-					return 0, INVALID_CHAR
-				}
-
-				if encoding != jisX0208 {
-					if len(p) < 3 {
-						return 0, NO_ROOM
-					}
-					p[0], p[1], p[2] = esc, '$', 'B'
-					encoding = jisX0208
-					return 3, STATE_ONLY
-				}
-
-				p[0] = jis[0] + 0x21
-				p[1] = jis[1] + 0x21
-				return 2, SUCCESS
-			}
-		},
-	})
-}

+ 0 - 162
modules/mahonia/jis0201-data.go

@@ -1,162 +0,0 @@
-package mahonia
-
-var jis0201ToUnicode = [256]uint16{
-	0x20: 0x0020, // SPACE
-	0x21: 0x0021, // EXCLAMATION MARK
-	0x22: 0x0022, // QUOTATION MARK
-	0x23: 0x0023, // NUMBER SIGN
-	0x24: 0x0024, // DOLLAR SIGN
-	0x25: 0x0025, // PERCENT SIGN
-	0x26: 0x0026, // AMPERSAND
-	0x27: 0x0027, // APOSTROPHE
-	0x28: 0x0028, // LEFT PARENTHESIS
-	0x29: 0x0029, // RIGHT PARENTHESIS
-	0x2A: 0x002A, // ASTERISK
-	0x2B: 0x002B, // PLUS SIGN
-	0x2C: 0x002C, // COMMA
-	0x2D: 0x002D, // HYPHEN-MINUS
-	0x2E: 0x002E, // FULL STOP
-	0x2F: 0x002F, // SOLIDUS
-	0x30: 0x0030, // DIGIT ZERO
-	0x31: 0x0031, // DIGIT ONE
-	0x32: 0x0032, // DIGIT TWO
-	0x33: 0x0033, // DIGIT THREE
-	0x34: 0x0034, // DIGIT FOUR
-	0x35: 0x0035, // DIGIT FIVE
-	0x36: 0x0036, // DIGIT SIX
-	0x37: 0x0037, // DIGIT SEVEN
-	0x38: 0x0038, // DIGIT EIGHT
-	0x39: 0x0039, // DIGIT NINE
-	0x3A: 0x003A, // COLON
-	0x3B: 0x003B, // SEMICOLON
-	0x3C: 0x003C, // LESS-THAN SIGN
-	0x3D: 0x003D, // EQUALS SIGN
-	0x3E: 0x003E, // GREATER-THAN SIGN
-	0x3F: 0x003F, // QUESTION MARK
-	0x40: 0x0040, // COMMERCIAL AT
-	0x41: 0x0041, // LATIN CAPITAL LETTER A
-	0x42: 0x0042, // LATIN CAPITAL LETTER B
-	0x43: 0x0043, // LATIN CAPITAL LETTER C
-	0x44: 0x0044, // LATIN CAPITAL LETTER D
-	0x45: 0x0045, // LATIN CAPITAL LETTER E
-	0x46: 0x0046, // LATIN CAPITAL LETTER F
-	0x47: 0x0047, // LATIN CAPITAL LETTER G
-	0x48: 0x0048, // LATIN CAPITAL LETTER H
-	0x49: 0x0049, // LATIN CAPITAL LETTER I
-	0x4A: 0x004A, // LATIN CAPITAL LETTER J
-	0x4B: 0x004B, // LATIN CAPITAL LETTER K
-	0x4C: 0x004C, // LATIN CAPITAL LETTER L
-	0x4D: 0x004D, // LATIN CAPITAL LETTER M
-	0x4E: 0x004E, // LATIN CAPITAL LETTER N
-	0x4F: 0x004F, // LATIN CAPITAL LETTER O
-	0x50: 0x0050, // LATIN CAPITAL LETTER P
-	0x51: 0x0051, // LATIN CAPITAL LETTER Q
-	0x52: 0x0052, // LATIN CAPITAL LETTER R
-	0x53: 0x0053, // LATIN CAPITAL LETTER S
-	0x54: 0x0054, // LATIN CAPITAL LETTER T
-	0x55: 0x0055, // LATIN CAPITAL LETTER U
-	0x56: 0x0056, // LATIN CAPITAL LETTER V
-	0x57: 0x0057, // LATIN CAPITAL LETTER W
-	0x58: 0x0058, // LATIN CAPITAL LETTER X
-	0x59: 0x0059, // LATIN CAPITAL LETTER Y
-	0x5A: 0x005A, // LATIN CAPITAL LETTER Z
-	0x5B: 0x005B, // LEFT SQUARE BRACKET
-	0x5C: 0x00A5, // YEN SIGN
-	0x5D: 0x005D, // RIGHT SQUARE BRACKET
-	0x5E: 0x005E, // CIRCUMFLEX ACCENT
-	0x5F: 0x005F, // LOW LINE
-	0x60: 0x0060, // GRAVE ACCENT
-	0x61: 0x0061, // LATIN SMALL LETTER A
-	0x62: 0x0062, // LATIN SMALL LETTER B
-	0x63: 0x0063, // LATIN SMALL LETTER C
-	0x64: 0x0064, // LATIN SMALL LETTER D
-	0x65: 0x0065, // LATIN SMALL LETTER E
-	0x66: 0x0066, // LATIN SMALL LETTER F
-	0x67: 0x0067, // LATIN SMALL LETTER G
-	0x68: 0x0068, // LATIN SMALL LETTER H
-	0x69: 0x0069, // LATIN SMALL LETTER I
-	0x6A: 0x006A, // LATIN SMALL LETTER J
-	0x6B: 0x006B, // LATIN SMALL LETTER K
-	0x6C: 0x006C, // LATIN SMALL LETTER L
-	0x6D: 0x006D, // LATIN SMALL LETTER M
-	0x6E: 0x006E, // LATIN SMALL LETTER N
-	0x6F: 0x006F, // LATIN SMALL LETTER O
-	0x70: 0x0070, // LATIN SMALL LETTER P
-	0x71: 0x0071, // LATIN SMALL LETTER Q
-	0x72: 0x0072, // LATIN SMALL LETTER R
-	0x73: 0x0073, // LATIN SMALL LETTER S
-	0x74: 0x0074, // LATIN SMALL LETTER T
-	0x75: 0x0075, // LATIN SMALL LETTER U
-	0x76: 0x0076, // LATIN SMALL LETTER V
-	0x77: 0x0077, // LATIN SMALL LETTER W
-	0x78: 0x0078, // LATIN SMALL LETTER X
-	0x79: 0x0079, // LATIN SMALL LETTER Y
-	0x7A: 0x007A, // LATIN SMALL LETTER Z
-	0x7B: 0x007B, // LEFT CURLY BRACKET
-	0x7C: 0x007C, // VERTICAL LINE
-	0x7D: 0x007D, // RIGHT CURLY BRACKET
-	0x7E: 0x203E, // OVERLINE
-	0xA1: 0xFF61, // HALFWIDTH IDEOGRAPHIC FULL STOP
-	0xA2: 0xFF62, // HALFWIDTH LEFT CORNER BRACKET
-	0xA3: 0xFF63, // HALFWIDTH RIGHT CORNER BRACKET
-	0xA4: 0xFF64, // HALFWIDTH IDEOGRAPHIC COMMA
-	0xA5: 0xFF65, // HALFWIDTH KATAKANA MIDDLE DOT
-	0xA6: 0xFF66, // HALFWIDTH KATAKANA LETTER WO
-	0xA7: 0xFF67, // HALFWIDTH KATAKANA LETTER SMALL A
-	0xA8: 0xFF68, // HALFWIDTH KATAKANA LETTER SMALL I
-	0xA9: 0xFF69, // HALFWIDTH KATAKANA LETTER SMALL U
-	0xAA: 0xFF6A, // HALFWIDTH KATAKANA LETTER SMALL E
-	0xAB: 0xFF6B, // HALFWIDTH KATAKANA LETTER SMALL O
-	0xAC: 0xFF6C, // HALFWIDTH KATAKANA LETTER SMALL YA
-	0xAD: 0xFF6D, // HALFWIDTH KATAKANA LETTER SMALL YU
-	0xAE: 0xFF6E, // HALFWIDTH KATAKANA LETTER SMALL YO
-	0xAF: 0xFF6F, // HALFWIDTH KATAKANA LETTER SMALL TU
-	0xB0: 0xFF70, // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
-	0xB1: 0xFF71, // HALFWIDTH KATAKANA LETTER A
-	0xB2: 0xFF72, // HALFWIDTH KATAKANA LETTER I
-	0xB3: 0xFF73, // HALFWIDTH KATAKANA LETTER U
-	0xB4: 0xFF74, // HALFWIDTH KATAKANA LETTER E
-	0xB5: 0xFF75, // HALFWIDTH KATAKANA LETTER O
-	0xB6: 0xFF76, // HALFWIDTH KATAKANA LETTER KA
-	0xB7: 0xFF77, // HALFWIDTH KATAKANA LETTER KI
-	0xB8: 0xFF78, // HALFWIDTH KATAKANA LETTER KU
-	0xB9: 0xFF79, // HALFWIDTH KATAKANA LETTER KE
-	0xBA: 0xFF7A, // HALFWIDTH KATAKANA LETTER KO
-	0xBB: 0xFF7B, // HALFWIDTH KATAKANA LETTER SA
-	0xBC: 0xFF7C, // HALFWIDTH KATAKANA LETTER SI
-	0xBD: 0xFF7D, // HALFWIDTH KATAKANA LETTER SU
-	0xBE: 0xFF7E, // HALFWIDTH KATAKANA LETTER SE
-	0xBF: 0xFF7F, // HALFWIDTH KATAKANA LETTER SO
-	0xC0: 0xFF80, // HALFWIDTH KATAKANA LETTER TA
-	0xC1: 0xFF81, // HALFWIDTH KATAKANA LETTER TI
-	0xC2: 0xFF82, // HALFWIDTH KATAKANA LETTER TU
-	0xC3: 0xFF83, // HALFWIDTH KATAKANA LETTER TE
-	0xC4: 0xFF84, // HALFWIDTH KATAKANA LETTER TO
-	0xC5: 0xFF85, // HALFWIDTH KATAKANA LETTER NA
-	0xC6: 0xFF86, // HALFWIDTH KATAKANA LETTER NI
-	0xC7: 0xFF87, // HALFWIDTH KATAKANA LETTER NU
-	0xC8: 0xFF88, // HALFWIDTH KATAKANA LETTER NE
-	0xC9: 0xFF89, // HALFWIDTH KATAKANA LETTER NO
-	0xCA: 0xFF8A, // HALFWIDTH KATAKANA LETTER HA
-	0xCB: 0xFF8B, // HALFWIDTH KATAKANA LETTER HI
-	0xCC: 0xFF8C, // HALFWIDTH KATAKANA LETTER HU
-	0xCD: 0xFF8D, // HALFWIDTH KATAKANA LETTER HE
-	0xCE: 0xFF8E, // HALFWIDTH KATAKANA LETTER HO
-	0xCF: 0xFF8F, // HALFWIDTH KATAKANA LETTER MA
-	0xD0: 0xFF90, // HALFWIDTH KATAKANA LETTER MI
-	0xD1: 0xFF91, // HALFWIDTH KATAKANA LETTER MU
-	0xD2: 0xFF92, // HALFWIDTH KATAKANA LETTER ME
-	0xD3: 0xFF93, // HALFWIDTH KATAKANA LETTER MO
-	0xD4: 0xFF94, // HALFWIDTH KATAKANA LETTER YA
-	0xD5: 0xFF95, // HALFWIDTH KATAKANA LETTER YU
-	0xD6: 0xFF96, // HALFWIDTH KATAKANA LETTER YO
-	0xD7: 0xFF97, // HALFWIDTH KATAKANA LETTER RA
-	0xD8: 0xFF98, // HALFWIDTH KATAKANA LETTER RI
-	0xD9: 0xFF99, // HALFWIDTH KATAKANA LETTER RU
-	0xDA: 0xFF9A, // HALFWIDTH KATAKANA LETTER RE
-	0xDB: 0xFF9B, // HALFWIDTH KATAKANA LETTER RO
-	0xDC: 0xFF9C, // HALFWIDTH KATAKANA LETTER WA
-	0xDD: 0xFF9D, // HALFWIDTH KATAKANA LETTER N
-	0xDE: 0xFF9E, // HALFWIDTH KATAKANA VOICED SOUND MARK
-	0xDF: 0xFF9F, // HALFWIDTH KATAKANA SEMI-VOICED SOUND MARK
-}

File diff suppressed because it is too large
+ 0 - 7039
modules/mahonia/jis0208-data.go


File diff suppressed because it is too large
+ 0 - 6209
modules/mahonia/jis0212-data.go


+ 0 - 88
modules/mahonia/kuten.go

@@ -1,88 +0,0 @@
-package mahonia
-
-import (
-	"sync"
-	"unicode/utf8"
-)
-
-// A kutenTable holds the data for a double-byte character set, arranged by ku
-// (区, zone) and ten (点, position). These can be converted to various actual
-// encoding schemes.
-type kutenTable struct {
-	// Data[ku][ten] is the unicode value for the character at that zone and
-	// position.
-	Data [94][94]uint16
-
-	// FromUnicode holds the ku and ten for each Unicode code point.
-	// It is not available until Reverse() has been called.
-	FromUnicode [][2]byte
-
-	// once is used to synchronize the generation of FromUnicode.
-	once sync.Once
-}
-
-// Reverse generates FromUnicode.
-func (t *kutenTable) Reverse() {
-	t.once.Do(func() {
-		t.FromUnicode = make([][2]byte, 65536)
-		for ku := range t.Data {
-			for ten, unicode := range t.Data[ku] {
-				t.FromUnicode[unicode] = [2]byte{byte(ku), byte(ten)}
-			}
-		}
-	})
-}
-
-// DecodeLow decodes a character from an encoding that does not have the high
-// bit set.
-func (t *kutenTable) DecodeLow(p []byte) (c rune, size int, status Status) {
-	if len(p) < 2 {
-		return 0, 0, NO_ROOM
-	}
-	ku := p[0] - 0x21
-	ten := p[1] - 0x21
-	if ku > 93 || ten > 93 {
-		return utf8.RuneError, 1, INVALID_CHAR
-	}
-	u := t.Data[ku][ten]
-	if u == 0 {
-		return utf8.RuneError, 1, INVALID_CHAR
-	}
-	return rune(u), 2, SUCCESS
-}
-
-// DecodeHigh decodes a character from an encoding that has the high bit set.
-func (t *kutenTable) DecodeHigh(p []byte) (c rune, size int, status Status) {
-	if len(p) < 2 {
-		return 0, 0, NO_ROOM
-	}
-	ku := p[0] - 0xa1
-	ten := p[1] - 0xa1
-	if ku > 93 || ten > 93 {
-		return utf8.RuneError, 1, INVALID_CHAR
-	}
-	u := t.Data[ku][ten]
-	if u == 0 {
-		return utf8.RuneError, 1, INVALID_CHAR
-	}
-	return rune(u), 2, SUCCESS
-}
-
-// EncodeHigh encodes a character in an encoding that has the high bit set.
-func (t *kutenTable) EncodeHigh(p []byte, c rune) (size int, status Status) {
-	if len(p) < 2 {
-		return 0, NO_ROOM
-	}
-	if c > 0xffff {
-		p[0] = '?'
-		return 1, INVALID_CHAR
-	}
-	kuten := t.FromUnicode[c]
-	if kuten == [2]byte{0, 0} && c != rune(t.Data[0][0]) {
-		p[0] = '?'
-		return 1, INVALID_CHAR
-	}
-	p[0] = kuten[0] + 0xa1
-	p[1] = kuten[1] + 0xa1
-	return 2, SUCCESS
-}

+ 0 - 229
modules/mahonia/mahonia_test.go

@@ -1,229 +0,0 @@
-package mahonia
-
-import (
-	"bytes"
-	"io/ioutil"
-	"testing"
-)
-
-var nameTests = map[string]string{
-	"utf8":       "utf8",
-	"ISO 8859-1": "iso88591",
-	"Big5":       "big5",
-	"":           "",
-}
-
-func TestSimplifyName(t *testing.T) {
-	for name, simple := range nameTests {
-		if simple != simplifyName(name) {
-			t.Errorf("%s came out as %s instead of as %s", name, simplifyName(name), simple)
-		}
-	}
-}
-
-var testData = []struct {
-	utf8, other, otherEncoding string
-}{
-	{"Résumé", "Résumé", "utf8"},
-	{"Résumé", "R\xe9sum\xe9", "latin-1"},
-	{"これは漢字です。", "S0\x8c0o0\"oW[g0Y0\x020", "UTF-16LE"},
-	{"これは漢字です。", "0S0\x8c0oo\"[W0g0Y0\x02", "UTF-16BE"},
-	{"これは漢字です。", "\xfe\xff0S0\x8c0oo\"[W0g0Y0\x02", "UTF-16"},
-	{"𝄢𝄞𝄪𝄫", "\xfe\xff\xd8\x34\xdd\x22\xd8\x34\xdd\x1e\xd8\x34\xdd\x2a\xd8\x34\xdd\x2b", "UTF-16"},
-	{"Hello, world", "Hello, world", "ASCII"},
-	{"Gdańsk", "Gda\xf1sk", "ISO-8859-2"},
-	{"Ââ Čč Đđ Ŋŋ Õõ Šš Žž Åå Ää", "\xc2\xe2 \xc8\xe8 \xa9\xb9 \xaf\xbf \xd5\xf5 \xaa\xba \xac\xbc \xc5\xe5 \xc4\xe4", "ISO-8859-10"},
-	{"สำหรับ", "\xca\xd3\xcb\xc3\u047a", "ISO-8859-11"},
-	{"latviešu", "latvie\xf0u", "ISO-8859-13"},
-	{"Seònaid", "Se\xf2naid", "ISO-8859-14"},
-	{"€1 is cheap", "\xa41 is cheap", "ISO-8859-15"},
-	{"românește", "rom\xe2ne\xbate", "ISO-8859-16"},
-	{"nutraĵo", "nutra\xbco", "ISO-8859-3"},
-	{"Kalâdlit", "Kal\xe2dlit", "ISO-8859-4"},
-	{"русский", "\xe0\xe3\xe1\xe1\xda\xd8\xd9", "ISO-8859-5"},
-	{"ελληνικά", "\xe5\xeb\xeb\xe7\xed\xe9\xea\xdc", "ISO-8859-7"},
-	{"Kağan", "Ka\xf0an", "ISO-8859-9"},
-	{"Résumé", "R\x8esum\x8e", "macintosh"},
-	{"Gdańsk", "Gda\xf1sk", "windows-1250"},
-	{"русский", "\xf0\xf3\xf1\xf1\xea\xe8\xe9", "windows-1251"},
-	{"Résumé", "R\xe9sum\xe9", "windows-1252"},
-	{"ελληνικά", "\xe5\xeb\xeb\xe7\xed\xe9\xea\xdc", "windows-1253"},
-	{"Kağan", "Ka\xf0an", "windows-1254"},
-	{"עִבְרִית", "\xf2\xc4\xe1\xc0\xf8\xc4\xe9\xfa", "windows-1255"},
-	{"العربية", "\xc7\xe1\xda\xd1\xc8\xed\xc9", "windows-1256"},
-	{"latviešu", "latvie\xf0u", "windows-1257"},
-	{"Việt", "Vi\xea\xf2t", "windows-1258"},
-	{"สำหรับ", "\xca\xd3\xcb\xc3\u047a", "windows-874"},
-	{"русский", "\xd2\xd5\xd3\xd3\xcb\xc9\xca", "KOI8-R"},
-	{"українська", "\xd5\xcb\xd2\xc1\xa7\xce\xd3\xd8\xcb\xc1", "KOI8-U"},
-	{"Hello 常用國字標準字體表", "Hello \xb1`\xa5\u03b0\xea\xa6r\xbc\u0437\u01e6r\xc5\xe9\xaa\xed", "big5"},
-	{"Hello 常用國字標準字體表", "Hello \xb3\xa3\xd3\xc3\x87\xf8\xd7\xd6\x98\xcb\x9c\xca\xd7\xd6\xf3\x77\xb1\xed", "gbk"},
-	{"Hello 常用國字標準字體表", "Hello \xb3\xa3\xd3\xc3\x87\xf8\xd7\xd6\x98\xcb\x9c\xca\xd7\xd6\xf3\x77\xb1\xed", "gb18030"},
-	{"עִבְרִית", "\x81\x30\xfb\x30\x81\x30\xf6\x34\x81\x30\xf9\x33\x81\x30\xf6\x30\x81\x30\xfb\x36\x81\x30\xf6\x34\x81\x30\xfa\x31\x81\x30\xfb\x38", "gb18030"},
-	{"㧯", "\x82\x31\x89\x38", "gb18030"},
-	{"これは漢字です。", "\x82\xb1\x82\xea\x82\xcd\x8a\xbf\x8e\x9a\x82\xc5\x82\xb7\x81B", "SJIS"},
-	{"Hello, 世界!", "Hello, \x90\xa2\x8aE!", "SJIS"},
-	{"イウエオカ", "\xb2\xb3\xb4\xb5\xb6", "SJIS"},
-	{"これは漢字です。", "\xa4\xb3\xa4\xec\xa4\u03f4\xc1\xbb\xfa\xa4\u01e4\xb9\xa1\xa3", "EUC-JP"},
-	{"これは漢字です。", "\xa4\xb3\xa4\xec\xa4\u03f4\xc1\xbb\xfa\xa4\u01e4\xb9\xa1\xa3", "CP51932"},
-	{"Thông tin bạn đồng hànhỌ", "Th\xabng tin b\xb9n \xae\xe5ng h\xb5nhO\xe4", "TCVN3"},
-	{"Hello, 世界!", "Hello, \x1b$B@$3&\x1b(B!", "ISO-2022-JP"},
-	{"네이트 | 즐거움의 시작, 슈파스(Spaβ) NATE", "\xb3\xd7\xc0\xcc\xc6\xae | \xc1\xf1\xb0\xc5\xbf\xf2\xc0\xc7 \xbd\xc3\xc0\xdb, \xbd\xb4\xc6\xc4\xbd\xba(Spa\xa5\xe2) NATE", "EUC-KR"},
-}
-
-func TestDecode(t *testing.T) {
-	for _, data := range testData {
-		d := NewDecoder(data.otherEncoding)
-		if d == nil {
-			t.Errorf("Could not create decoder for %s", data.otherEncoding)
-			continue
-		}
-
-		str := d.ConvertString(data.other)
-
-		if str != data.utf8 {
-			t.Errorf("Unexpected value: %#v (expected %#v)", str, data.utf8)
-		}
-	}
-}
-
-func TestDecodeTranslate(t *testing.T) {
-	for _, data := range testData {
-		d := NewDecoder(data.otherEncoding)
-		if d == nil {
-			t.Errorf("Could not create decoder for %s", data.otherEncoding)
-			continue
-		}
-
-		_, cdata, _ := d.Translate([]byte(data.other), true)
-		str := string(cdata)
-
-		if str != data.utf8 {
-			t.Errorf("Unexpected value: %#v (expected %#v)", str, data.utf8)
-		}
-	}
-}
-
-func TestEncode(t *testing.T) {
-	for _, data := range testData {
-		e := NewEncoder(data.otherEncoding)
-		if e == nil {
-			t.Errorf("Could not create encoder for %s", data.otherEncoding)
-			continue
-		}
-
-		str := e.ConvertString(data.utf8)
-
-		if str != data.other {
-			t.Errorf("Unexpected value: %#v (expected %#v)", str, data.other)
-		}
-	}
-}
-
-func TestReader(t *testing.T) {
-	for _, data := range testData {
-		d := NewDecoder(data.otherEncoding)
-		if d == nil {
-			t.Errorf("Could not create decoder for %s", data.otherEncoding)
-			continue
-		}
-
-		b := bytes.NewBufferString(data.other)
-		r := d.NewReader(b)
-		result, _ := ioutil.ReadAll(r)
-		str := string(result)
-
-		if str != data.utf8 {
-			t.Errorf("Unexpected value: %#v (expected %#v)", str, data.utf8)
-		}
-	}
-}
-
-func TestWriter(t *testing.T) {
-	for _, data := range testData {
-		e := NewEncoder(data.otherEncoding)
-		if e == nil {
-			t.Errorf("Could not create encoder for %s", data.otherEncoding)
-			continue
-		}
-
-		b := new(bytes.Buffer)
-		w := e.NewWriter(b)
-		w.Write([]byte(data.utf8))
-		str := b.String()
-
-		if str != data.other {
-			t.Errorf("Unexpected value: %#v (expected %#v)", str, data.other)
-		}
-	}
-}
-
-func TestFallback(t *testing.T) {
-	mixed := "résum\xe9 " // The space is needed because of the issue mentioned in the Note: in fallback.go
-	pure := "résumé "
-	d := FallbackDecoder(NewDecoder("utf8"), NewDecoder("ISO-8859-1"))
-	result := d.ConvertString(mixed)
-	if result != pure {
-		t.Errorf("Unexpected value: %#v (expected %#v)", result, pure)
-	}
-}
-
-func TestEntities(t *testing.T) {
-	escaped := "&notit; I'm &notin; I tell you&#X82&#32;&nLt; "
-	plain := "¬it; I'm ∉ I tell you\u201a \u226A\u20D2 "
-	d := FallbackDecoder(EntityDecoder(), NewDecoder("ISO-8859-1"))
-	result := d.ConvertString(escaped)
-	if result != plain {
-		t.Errorf("Unexpected value: %#v (expected %#v)", result, plain)
-	}
-}
-
-func TestConvertStringOK(t *testing.T) {
-	d := NewDecoder("ASCII")
-	if d == nil {
-		t.Fatal("Could not create decoder for ASCII")
-	}
-
-	str, ok := d.ConvertStringOK("hello")
-	if !ok {
-		t.Error("Spurious error found while decoding")
-	}
-	if str != "hello" {
-		t.Errorf("expected %#v, got %#v", "hello", str)
-	}
-
-	str, ok = d.ConvertStringOK("\x80")
-	if ok {
-		t.Error(`Failed to detect error decoding "\x80"`)
-	}
-
-	e := NewEncoder("ISO-8859-3")
-	if e == nil {
-		t.Fatal("Could not create encoder for ISO-8859-1")
-	}
-
-	str, ok = e.ConvertStringOK("nutraĵo")
-	if !ok {
-		t.Error("spurious error while encoding")
-	}
-	if str != "nutra\xbco" {
-		t.Errorf("expected %#v, got %#v", "nutra\xbco", str)
-	}
-
-	str, ok = e.ConvertStringOK("\x80abc")
-	if ok {
-		t.Error("failed to detect invalid UTF-8 while encoding")
-	}
-
-	str, ok = e.ConvertStringOK("русский")
-	if ok {
-		t.Error("failed to detect characters that couldn't be encoded")
-	}
-}
-
-func TestBadCharset(t *testing.T) {
-	d := NewDecoder("this is not a valid charset")
-	if d != nil {
-		t.Fatal("got a non-nil decoder for an invalid charset")
-	}
-}

+ 0 - 40
modules/mahonia/mahoniconv/mahoniconv.go

@@ -1,40 +0,0 @@
-package main
-
-import (
-	"flag"
-	"io"
-	"log"
-	"os"
-
-	"github.com/gogits/gogs/modules/mahonia"
-)
-
-// An iconv workalike using mahonia.
-
-var from = flag.String("f", "utf-8", "source character set")
-var to = flag.String("t", "utf-8", "destination character set")
-
-func main() {
-	flag.Parse()
-
-	var r io.Reader = os.Stdin
-	var w io.Writer = os.Stdout
-
-	if *from != "utf-8" {
-		decode := mahonia.NewDecoder(*from)
-		if decode == nil {
-			log.Fatalf("Could not create decoder for %s", *from)
-		}
-		r = decode.NewReader(r)
-	}
-
-	if *to != "utf-8" {
-		encode := mahonia.NewEncoder(*to)
-		if encode == nil {
-			log.Fatalf("Could not create decoder for %s", *to)
-		}
-		w = encode.NewWriter(w)
-	}
-
-	io.Copy(w, r)
-}

+ 0 - 92
modules/mahonia/mbcs.go

@@ -1,92 +0,0 @@
-package mahonia
-
-// Generic converters for multibyte character sets.
-
-// An mbcsTrie contains the data to convert from the character set to Unicode.
-// If a character would be encoded as "\x01\x02\x03", its unicode value would be found at t.children[1].children[2].children[3].rune
-// children either is nil or has 256 elements.
-type mbcsTrie struct {
-	// For leaf nodes, the Unicode character that is represented.
-	char rune
-
-	// For non-leaf nodes, the trie to decode the remainder of the character.
-	children []mbcsTrie
-}
-
-// A MBCSTable holds the data to convert to and from Unicode.
-type MBCSTable struct {
-	toUnicode   mbcsTrie
-	fromUnicode map[rune]string
-}
-
-// AddCharacter adds a character to the table. rune is its Unicode code point,
-// and bytes contains the bytes used to encode it in the character set.
-func (table *MBCSTable) AddCharacter(c rune, bytes string) {
-	if table.fromUnicode == nil {
-		table.fromUnicode = make(map[rune]string)
-	}
-
-	table.fromUnicode[c] = bytes
-
-	trie := &table.toUnicode
-	for i := 0; i < len(bytes); i++ {
-		if trie.children == nil {
-			trie.children = make([]mbcsTrie, 256)
-		}
-
-		b := bytes[i]
-		trie = &trie.children[b]
-	}
-
-	trie.char = c
-}
-
-func (table *MBCSTable) Decoder() Decoder {
-	return func(p []byte) (c rune, size int, status Status) {
-		if len(p) == 0 {
-			status = NO_ROOM
-			return
-		}
-
-		if p[0] == 0 {
-			return 0, 1, SUCCESS
-		}
-
-		trie := &table.toUnicode
-		for trie.char == 0 {
-			if trie.children == nil {
-				return 0xfffd, 1, INVALID_CHAR
-			}
-			if len(p) < size+1 {
-				return 0, 0, NO_ROOM
-			}
-
-			trie = &trie.children[p[size]]
-			size++
-		}
-
-		c = trie.char
-		status = SUCCESS
-		return
-	}
-}
-
-func (table *MBCSTable) Encoder() Encoder {
-	return func(p []byte, c rune) (size int, status Status) {
-		bytes := table.fromUnicode[c]
-		if bytes == "" {
-			if len(p) > 0 {
-				p[0] = '?'
-				return 1, INVALID_CHAR
-			} else {
-				return 0, NO_ROOM
-			}
-		}
-
-		if len(p) < len(bytes) {
-			return 0, NO_ROOM
-		}
-
-		return copy(p, bytes), SUCCESS
-	}
-}

File diff suppressed because it is too large
+ 0 - 7497
modules/mahonia/ms-jis-data.go


+ 0 - 151
modules/mahonia/reader.go

@@ -1,151 +0,0 @@
-package mahonia
-
-// This file is based on bufio.Reader in the Go standard library,
-// which has the following copyright notice:
-
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-import (
-	"io"
-	"unicode/utf8"
-)
-
-const (
-	defaultBufSize = 4096
-)
-
-// Reader implements character-set decoding for an io.Reader object.
-type Reader struct {
-	buf    []byte
-	rd     io.Reader
-	decode Decoder
-	r, w   int
-	err    error
-}
-
-// NewReader creates a new Reader that uses the receiver to decode text.
-func (d Decoder) NewReader(rd io.Reader) *Reader {
-	b := new(Reader)
-	b.buf = make([]byte, defaultBufSize)
-	b.rd = rd
-	b.decode = d
-	return b
-}
-
-// fill reads a new chunk into the buffer.
-func (b *Reader) fill() {
-	// Slide existing data to beginning.
-	if b.r > 0 {
-		copy(b.buf, b.buf[b.r:b.w])
-		b.w -= b.r
-		b.r = 0
-	}
-
-	// Read new data.
-	n, e := b.rd.Read(b.buf[b.w:])
-	b.w += n
-	if e != nil {
-		b.err = e
-	}
-}
-
-// Read reads data into p.
-// It returns the number of bytes read into p.
-// It calls Read at most once on the underlying Reader,
-// hence n may be less than len(p).
-// At EOF, the count will be zero and err will be os.EOF.
-func (b *Reader) Read(p []byte) (n int, err error) {
-	n = len(p)
-	filled := false
-	if n == 0 {
-		return 0, b.err
-	}
-	if b.w == b.r {
-		if b.err != nil {
-			return 0, b.err
-		}
-		if n > len(b.buf) {
-			// Large read, empty buffer.
-			// Allocate a larger buffer for efficiency.
-			b.buf = make([]byte, n)
-		}
-		b.fill()
-		filled = true
-		if b.w == b.r {
-			return 0, b.err
-		}
-	}
-
-	i := 0
-	for i < n {
-		rune, size, status := b.decode(b.buf[b.r:b.w])
-
-		if status == STATE_ONLY {
-			b.r += size
-			continue
-		}
-
-		if status == NO_ROOM {
-			if b.err != nil {
-				rune = 0xfffd
-				size = b.w - b.r
-				if size == 0 {
-					break
-				}
-				status = INVALID_CHAR
-			} else if filled {
-				break
-			} else {
-				b.fill()
-				filled = true
-				continue
-			}
-		}
-
-		if i+utf8.RuneLen(rune) > n {
-			break
-		}
-
-		b.r += size
-		if rune < 128 {
-			p[i] = byte(rune)
-			i++
-		} else {
-			i += utf8.EncodeRune(p[i:], rune)
-		}
-	}
-
-	return i, nil
-}
-
-// ReadRune reads a single Unicode character and returns the
-// rune and its size in bytes.
-func (b *Reader) ReadRune() (c rune, size int, err error) {
-read:
-	c, size, status := b.decode(b.buf[b.r:b.w])
-
-	if status == NO_ROOM && b.err == nil {
-		b.fill()
-		goto read
-	}
-
-	if status == STATE_ONLY {
-		b.r += size
-		goto read
-	}
-
-	if b.r == b.w {
-		return 0, 0, b.err
-	}
-
-	if status == NO_ROOM {
-		c = 0xfffd
-		size = b.w - b.r
-		status = INVALID_CHAR
-	}
-
-	b.r += size
-	return c, size, nil
-}

File diff suppressed because it is too large
+ 0 - 7748
modules/mahonia/shiftjis-data.go


+ 0 - 88
modules/mahonia/shiftjis.go

@@ -1,88 +0,0 @@
-package mahonia
-
-// Converters for the Shift-JIS encoding.
-
-import (
-	"unicode/utf8"
-)
-
-func init() {
-	RegisterCharset(&Charset{
-		Name:    "Shift_JIS",
-		Aliases: []string{"MS_Kanji", "csShiftJIS", "SJIS", "ibm-943", "windows-31j", "cp932", "windows-932"},
-		NewDecoder: func() Decoder {
-			return decodeSJIS
-		},
-		NewEncoder: func() Encoder {
-			shiftJISOnce.Do(reverseShiftJISTable)
-			return encodeSJIS
-		},
-	})
-}
-
-func decodeSJIS(p []byte) (c rune, size int, status Status) {
-	if len(p) == 0 {
-		return 0, 0, NO_ROOM
-	}
-
-	b := p[0]
-	if b < 0x80 {
-		return rune(b), 1, SUCCESS
-	}
-
-	if 0xa1 <= b && b <= 0xdf {
-		return rune(b) + (0xff61 - 0xa1), 1, SUCCESS
-	}
-
-	if b == 0x80 || b == 0xa0 {
-		return utf8.RuneError, 1, INVALID_CHAR
-	}
-
-	if len(p) < 2 {
-		return 0, 0, NO_ROOM
-	}
-
-	jis := int(b)<<8 + int(p[1])
-	c = rune(shiftJISToUnicode[jis])
-
-	if c == 0 {
-		return utf8.RuneError, 2, INVALID_CHAR
-	}
-	return c, 2, SUCCESS
-}
-
-func encodeSJIS(p []byte, c rune) (size int, status Status) {
-	if len(p) == 0 {
-		return 0, NO_ROOM
-	}
-
-	if c < 0x80 {
-		p[0] = byte(c)
-		return 1, SUCCESS
-	}
-
-	if 0xff61 <= c && c <= 0xff9f {
-		// half-width katakana
-		p[0] = byte(c - (0xff61 - 0xa1))
-		return 1, SUCCESS
-	}
-
-	if len(p) < 2 {
-		return 0, NO_ROOM
-	}
-
-	if c > 0xffff {
-		p[0] = '?'
-		return 1, INVALID_CHAR
-	}
-
-	jis := unicodeToShiftJIS[c]
-	if jis == 0 {
-		p[0] = '?'
-		return 1, INVALID_CHAR
-	}
-
-	p[0] = byte(jis >> 8)
-	p[1] = byte(jis)
-	return 2, SUCCESS
-}

+ 0 - 228
modules/mahonia/tcvn3.go

@@ -1,228 +0,0 @@
-package mahonia
-
-// Converters for TCVN3 encoding.
-
-import (
-	"sync"
-)
-
-var (
-	onceTCVN3 sync.Once
-	dataTCVN3 = struct {
-		UnicodeToWord map[rune][2]byte
-		WordToUnicode [256]struct {
-			r rune
-			m *[256]rune
-		}
-	}{}
-)
-
-func init() {
-	p := new(Charset)
-	p.Name = "TCVN3"
-	p.NewDecoder = func() Decoder {
-		onceTCVN3.Do(buildTCVN3Tables)
-		return decodeTCVN3
-	}
-	p.NewEncoder = func() Encoder {
-		onceTCVN3.Do(buildTCVN3Tables)
-		return encodeTCVN3
-	}
-	RegisterCharset(p)
-}
-
-func decodeTCVN3(p []byte) (rune, int, Status) {
-	if len(p) == 0 {
-		return 0, 0, NO_ROOM
-	}
-	item := &dataTCVN3.WordToUnicode[p[0]]
-	if item.m != nil && len(p) > 1 {
-		if r := item.m[p[1]]; r != 0 {
-			return r, 2, SUCCESS
-		}
-	}
-	if item.r != 0 {
-		return item.r, 1, SUCCESS
-	}
-	if p[0] < 0x80 {
-		return rune(p[0]), 1, SUCCESS
-	}
-	return '?', 1, INVALID_CHAR
-}
-
-func encodeTCVN3(p []byte, c rune) (int, Status) {
-	if len(p) == 0 {
-		return 0, NO_ROOM
-	}
-	if c < rune(0x80) {
-		p[0] = byte(c)
-		return 1, SUCCESS
-	}
-	if v, ok := dataTCVN3.UnicodeToWord[c]; ok {
-		if v[1] != 0 {
-			if len(p) < 2 {
-				return 0, NO_ROOM
-			}
-			p[0] = v[0]
-			p[1] = v[1]
-			return 2, SUCCESS
-		} else {
-			p[0] = v[0]
-			return 1, SUCCESS
-		}
-	}
-	p[0] = '?'
-	return 1, INVALID_CHAR
-}
-
-func buildTCVN3Tables() {
-	dataTCVN3.UnicodeToWord = map[rune][2]byte{
-		// one byte
-		0x00C2: {0xA2, 0x00},
-		0x00CA: {0xA3, 0x00},
-		0x00D4: {0xA4, 0x00},
-		0x00E0: {0xB5, 0x00},
-		0x00E1: {0xB8, 0x00},
-		0x00E2: {0xA9, 0x00},
-		0x00E3: {0xB7, 0x00},
-		0x00E8: {0xCC, 0x00},
-		0x00E9: {0xD0, 0x00},
-		0x00EA: {0xAA, 0x00},
-		0x00EC: {0xD7, 0x00},
-		0x00ED: {0xDD, 0x00},
-		0x00F2: {0xDF, 0x00},
-		0x00F3: {0xE3, 0x00},
-		0x00F4: {0xAB, 0x00},
-		0x00F5: {0xE2, 0x00},
-		0x00F9: {0xEF, 0x00},
-		0x00FA: {0xF3, 0x00},
-		0x00FD: {0xFD, 0x00},
-		0x0102: {0xA1, 0x00},
-		0x0103: {0xA8, 0x00},
-		0x0110: {0xA7, 0x00},
-		0x0111: {0xAE, 0x00},
-		0x0129: {0xDC, 0x00},
-		0x0169: {0xF2, 0x00},
-		0x01A0: {0xA5, 0x00},
-		0x01A1: {0xAC, 0x00},
-		0x01AF: {0xA6, 0x00},
-		0x01B0: {0xAD, 0x00},
-		0x1EA1: {0xB9, 0x00},
-		0x1EA3: {0xB6, 0x00},
-		0x1EA5: {0xCA, 0x00},
-		0x1EA7: {0xC7, 0x00},
-		0x1EA9: {0xC8, 0x00},
-		0x1EAB: {0xC9, 0x00},
-		0x1EAD: {0xCB, 0x00},
-		0x1EAF: {0xBE, 0x00},
-		0x1EB1: {0xBB, 0x00},
-		0x1EB3: {0xBC, 0x00},
-		0x1EB5: {0xBD, 0x00},
-		0x1EB7: {0xC6, 0x00},
-		0x1EB9: {0xD1, 0x00},
-		0x1EBB: {0xCE, 0x00},
-		0x1EBD: {0xCF, 0x00},
-		0x1EBF: {0xD5, 0x00},
-		0x1EC1: {0xD2, 0x00},
-		0x1EC3: {0xD3, 0x00},
-		0x1EC5: {0xD4, 0x00},
-		0x1EC7: {0xD6, 0x00},
-		0x1EC9: {0xD8, 0x00},
-		0x1ECB: {0xDE, 0x00},
-		0x1ECD: {0xE4, 0x00},
-		0x1ECF: {0xE1, 0x00},
-		0x1ED1: {0xE8, 0x00},
-		0x1ED3: {0xE5, 0x00},
-		0x1ED5: {0xE6, 0x00},
-		0x1ED7: {0xE7, 0x00},
-		0x1ED9: {0xE9, 0x00},
-		0x1EDB: {0xED, 0x00},
-		0x1EDD: {0xEA, 0x00},
-		0x1EDF: {0xEB, 0x00},
-		0x1EE1: {0xEC, 0x00},
-		0x1EE3: {0xEE, 0x00},
-		0x1EE5: {0xF4, 0x00},
-		0x1EE7: {0xF1, 0x00},
-		0x1EE9: {0xF8, 0x00},
-		0x1EEB: {0xF5, 0x00},
-		0x1EED: {0xF6, 0x00},
-		0x1EEF: {0xF7, 0x00},
-		0x1EF1: {0xF9, 0x00},
-		0x1EF3: {0xFA, 0x00},
-		0x1EF5: {0xFE, 0x00},
-		0x1EF7: {0xFB, 0x00},
-		0x1EF9: {0xFC, 0x00},
-		// two bytes
-		0x00C0: {0x41, 0xB5},
-		0x00C1: {0x41, 0xB8},
-		0x00C3: {0x41, 0xB7},
-		0x00C8: {0x45, 0xCC},
-		0x00C9: {0x45, 0xD0},
-		0x00CC: {0x49, 0xD7},
-		0x00CD: {0x49, 0xDD},
-		0x00D2: {0x4F, 0xDF},
-		0x00D3: {0x4F, 0xE3},
-		0x00D5: {0x4F, 0xE2},
-		0x00D9: {0x55, 0xEF},
-		0x00DA: {0x55, 0xF3},
-		0x00DD: {0x59, 0xFD},
-		0x0128: {0x49, 0xDC},
-		0x0168: {0x55, 0xF2},
-		0x1EA0: {0x41, 0xB9},
-		0x1EA2: {0x41, 0xB6},
-		0x1EA4: {0xA2, 0xCA},
-		0x1EA6: {0xA2, 0xC7},
-		0x1EA8: {0xA2, 0xC8},
-		0x1EAA: {0xA2, 0xC9},
-		0x1EAC: {0xA2, 0xCB},
-		0x1EAE: {0xA1, 0xBE},
-		0x1EB0: {0xA1, 0xBB},
-		0x1EB2: {0xA1, 0xBC},
-		0x1EB4: {0xA1, 0xBD},
-		0x1EB6: {0xA1, 0xC6},
-		0x1EB8: {0x45, 0xD1},
-		0x1EBA: {0x45, 0xCE},
-		0x1EBC: {0x45, 0xCF},
-		0x1EBE: {0xA3, 0xD5},
-		0x1EC0: {0xA3, 0xD2},
-		0x1EC2: {0xA3, 0xD3},
-		0x1EC4: {0xA3, 0xD4},
-		0x1EC6: {0xA3, 0xD6},
-		0x1EC8: {0x49, 0xD8},
-		0x1ECA: {0x49, 0xDE},
-		0x1ECC: {0x4F, 0xE4},
-		0x1ECE: {0x4F, 0xE1},
-		0x1ED0: {0xA4, 0xE8},
-		0x1ED2: {0xA4, 0xE5},
-		0x1ED4: {0xA4, 0xE6},
-		0x1ED6: {0xA4, 0xE7},
-		0x1ED8: {0xA4, 0xE9},
-		0x1EDA: {0xA5, 0xED},
-		0x1EDC: {0xA5, 0xEA},
-		0x1EDE: {0xA5, 0xEB},
-		0x1EE0: {0xA5, 0xEC},
-		0x1EE2: {0xA5, 0xEE},
-		0x1EE4: {0x55, 0xF4},
-		0x1EE6: {0x55, 0xF1},
-		0x1EE8: {0xA6, 0xF8},
-		0x1EEA: {0xA6, 0xF5},
-		0x1EEC: {0xA6, 0xF6},
-		0x1EEE: {0xA6, 0xF7},
-		0x1EF0: {0xA6, 0xF9},
-		0x1EF2: {0x59, 0xFA},
-		0x1EF4: {0x59, 0xFE},
-		0x1EF6: {0x59, 0xFB},
-		0x1EF8: {0x59, 0xFC},
-	}
-	for r, b := range dataTCVN3.UnicodeToWord {
-		item := &dataTCVN3.WordToUnicode[b[0]]
-		if b[1] == 0 {
-			item.r = r
-		} else {
-			if item.m == nil {
-				item.m = new([256]rune)
-			}
-			item.m[b[1]] = r
-		}
-	}
-}

+ 0 - 50
modules/mahonia/translate.go

@@ -1,50 +0,0 @@
-package mahonia
-
-import "unicode/utf8"
-
-// Translate enables a Decoder to implement go-charset's Translator interface.
-func (d Decoder) Translate(data []byte, eof bool) (n int, cdata []byte, err error) {
-	cdata = make([]byte, len(data)+1)
-	destPos := 0
-
-	for n < len(data) {
-		rune, size, status := d(data[n:])
-
-		switch status {
-		case STATE_ONLY:
-			n += size
-			continue
-
-		case NO_ROOM:
-			if !eof {
-				return n, cdata[:destPos], nil
-			}
-			rune = 0xfffd
-			n = len(data)
-
-		default:
-			n += size
-		}
-
-		if rune < 128 {
-			if destPos >= len(cdata) {
-				cdata = doubleLength(cdata)
-			}
-			cdata[destPos] = byte(rune)
-			destPos++
-		} else {
-			if destPos+utf8.RuneLen(rune) > len(cdata) {
-				cdata = doubleLength(cdata)
-			}
-			destPos += utf8.EncodeRune(cdata[destPos:], rune)
-		}
-	}
-
-	return n, cdata[:destPos], nil
-}
-
-func doubleLength(b []byte) []byte {
-	b2 := make([]byte, 2*len(b))
-	copy(b2, b)
-	return b2
-}

+ 0 - 170
modules/mahonia/utf16.go

@@ -1,170 +0,0 @@
-package mahonia
-
-import (
-	"unicode/utf16"
-)
-
-func init() {
-	for i := 0; i < len(utf16Charsets); i++ {
-		RegisterCharset(&utf16Charsets[i])
-	}
-}
-
-var utf16Charsets = []Charset{
-	{
-		Name: "UTF-16",
-		NewDecoder: func() Decoder {
-			var decodeRune Decoder
-			return func(p []byte) (c rune, size int, status Status) {
-				if decodeRune == nil {
-					// haven't read the BOM yet
-					if len(p) < 2 {
-						status = NO_ROOM
-						return
-					}
-
-					switch {
-					case p[0] == 0xfe && p[1] == 0xff:
-						decodeRune = decodeUTF16beRune
-						return 0, 2, STATE_ONLY
-					case p[0] == 0xff && p[1] == 0xfe:
-						decodeRune = decodeUTF16leRune
-						return 0, 2, STATE_ONLY
-					default:
-						decodeRune = decodeUTF16beRune
-					}
-				}
-
-				return decodeRune(p)
-			}
-		},
-		NewEncoder: func() Encoder {
-			wroteBOM := false
-			return func(p []byte, c rune) (size int, status Status) {
-				if !wroteBOM {
-					if len(p) < 2 {
-						status = NO_ROOM
-						return
-					}
-
-					p[0] = 0xfe
-					p[1] = 0xff
-					wroteBOM = true
-					return 2, STATE_ONLY
-				}
-
-				return encodeUTF16beRune(p, c)
-			}
-		},
-	},
-	{
-		Name:       "UTF-16BE",
-		NewDecoder: func() Decoder { return decodeUTF16beRune },
-		NewEncoder: func() Encoder { return encodeUTF16beRune },
-	},
-	{
-		Name:       "UTF-16LE",
-		NewDecoder: func() Decoder { return decodeUTF16leRune },
-		NewEncoder: func() Encoder { return encodeUTF16leRune },
-	},
-}
-
-func decodeUTF16beRune(p []byte) (r rune, size int, status Status) {
-	if len(p) < 2 {
-		status = NO_ROOM
-		return
-	}
-
-	c := rune(p[0])<<8 + rune(p[1])
-
-	if utf16.IsSurrogate(c) {
-		if len(p) < 4 {
-			status = NO_ROOM
-			return
-		}
-
-		c2 := rune(p[2])<<8 + rune(p[3])
-		c = utf16.DecodeRune(c, c2)
-
-		if c == 0xfffd {
-			return c, 2, INVALID_CHAR
-		} else {
-			return c, 4, SUCCESS
-		}
-	}
-
-	return c, 2, SUCCESS
-}
-
-func encodeUTF16beRune(p []byte, c rune) (size int, status Status) {
-	if c < 0x10000 {
-		if len(p) < 2 {
-			status = NO_ROOM
-			return
-		}
-		p[0] = byte(c >> 8)
-		p[1] = byte(c)
-		return 2, SUCCESS
-	}
-
-	if len(p) < 4 {
-		status = NO_ROOM
-		return
-	}
-	s1, s2 := utf16.EncodeRune(c)
-	p[0] = byte(s1 >> 8)
-	p[1] = byte(s1)
-	p[2] = byte(s2 >> 8)
-	p[3] = byte(s2)
-	return 4, SUCCESS
-}
-
-func decodeUTF16leRune(p []byte) (r rune, size int, status Status) {
-	if len(p) < 2 {
-		status = NO_ROOM
-		return
-	}
-
-	c := rune(p[1])<<8 + rune(p[0])
-
-	if utf16.IsSurrogate(c) {
-		if len(p) < 4 {
-			status = NO_ROOM
-			return
-		}
-
-		c2 := rune(p[3])<<8 + rune(p[2])
-		c = utf16.DecodeRune(c, c2)
-
-		if c == 0xfffd {
-			return c, 2, INVALID_CHAR
-		} else {
-			return c, 4, SUCCESS
-		}
-	}
-
-	return c, 2, SUCCESS
-}
-
-func encodeUTF16leRune(p []byte, c rune) (size int, status Status) {
-	if c < 0x10000 {
-		if len(p) < 2 {
-			status = NO_ROOM
-			return
-		}
-		p[1] = byte(c >> 8)
-		p[0] = byte(c)
-		return 2, SUCCESS
-	}
-
-	if len(p) < 4 {
-		status = NO_ROOM
-		return
-	}
-	s1, s2 := utf16.EncodeRune(c)
-	p[1] = byte(s1 >> 8)
-	p[0] = byte(s1)
-	p[3] = byte(s2 >> 8)
-	p[2] = byte(s2)
-	return 4, SUCCESS
-}

+ 0 - 45
modules/mahonia/utf8.go

@@ -1,45 +0,0 @@
-package mahonia
-
-import "unicode/utf8"
-
-func init() {
-	RegisterCharset(&Charset{
-		Name:       "UTF-8",
-		NewDecoder: func() Decoder { return decodeUTF8Rune },
-		NewEncoder: func() Encoder { return encodeUTF8Rune },
-	})
-}
-
-func decodeUTF8Rune(p []byte) (c rune, size int, status Status) {
-	if len(p) == 0 {
-		status = NO_ROOM
-		return
-	}
-
-	if p[0] < 128 {
-		return rune(p[0]), 1, SUCCESS
-	}
-
-	c, size = utf8.DecodeRune(p)
-
-	if c == 0xfffd {
-		if utf8.FullRune(p) {
-			status = INVALID_CHAR
-			return
-		}
-
-		return 0, 0, NO_ROOM
-	}
-
-	status = SUCCESS
-	return
-}
-
-func encodeUTF8Rune(p []byte, c rune) (size int, status Status) {
-	size = utf8.RuneLen(c)
-	if size > len(p) {
-		return 0, NO_ROOM
-	}
-
-	return utf8.EncodeRune(p, c), SUCCESS
-}

+ 0 - 108
modules/mahonia/writer.go

@@ -1,108 +0,0 @@
-package mahonia
-
-import (
-	"io"
-	"unicode/utf8"
-)
-
-// Writer implements character-set encoding for an io.Writer object.
-type Writer struct {
-	wr     io.Writer
-	encode Encoder
-	inbuf  []byte
-	outbuf []byte
-}
-
-// NewWriter creates a new Writer that uses the receiver to encode text.
-func (e Encoder) NewWriter(wr io.Writer) *Writer {
-	w := new(Writer)
-	w.wr = wr
-	w.encode = e
-	return w
-}
-
-// Write encodes and writes the data from p.
-func (w *Writer) Write(p []byte) (n int, err error) {
-	n = len(p)
-
-	if len(w.inbuf) > 0 {
-		w.inbuf = append(w.inbuf, p...)
-		p = w.inbuf
-	}
-
-	if len(w.outbuf) < len(p) {
-		w.outbuf = make([]byte, len(p)+10)
-	}
-
-	outpos := 0
-
-	for len(p) > 0 {
-		rune, size := utf8.DecodeRune(p)
-		if rune == 0xfffd && !utf8.FullRune(p) {
-			break
-		}
-
-		p = p[size:]
-
-	retry:
-		size, status := w.encode(w.outbuf[outpos:], rune)
-
-		if status == NO_ROOM {
-			newDest := make([]byte, len(w.outbuf)*2)
-			copy(newDest, w.outbuf)
-			w.outbuf = newDest
-			goto retry
-		}
-
-		if status == STATE_ONLY {
-			outpos += size
-			goto retry
-		}
-
-		outpos += size
-	}
-
-	w.inbuf = w.inbuf[:0]
-	if len(p) > 0 {
-		w.inbuf = append(w.inbuf, p...)
-	}
-
-	n1, err := w.wr.Write(w.outbuf[0:outpos])
-
-	if err != nil && n1 < n {
-		n = n1
-	}
-
-	return
-}
-
-func (w *Writer) WriteRune(c rune) (size int, err error) {
-	if len(w.inbuf) > 0 {
-		// There are leftover bytes, a partial UTF-8 sequence.
-		w.inbuf = w.inbuf[:0]
-		w.WriteRune(0xfffd)
-	}
-
-	if w.outbuf == nil {
-		w.outbuf = make([]byte, 16)
-	}
-
-	outpos := 0
-
-retry:
-	size, status := w.encode(w.outbuf[outpos:], c)
-
-	if status == NO_ROOM {
-		w.outbuf = make([]byte, len(w.outbuf)*2)
-		goto retry
-	}
-
-	if status == STATE_ONLY {
-		outpos += size
-		goto retry
-	}
-
-	outpos += size
-
-	return w.wr.Write(w.outbuf[0:outpos])
-}