mirror of
https://codeberg.org/forgejo/forgejo.git
synced 2024-12-20 21:32:46 +03:00
103 lines
2.2 KiB
Go
103 lines
2.2 KiB
Go
|
package chardet
|
||
|
|
||
|
import (
|
||
|
"bytes"
|
||
|
)
|
||
|
|
||
|
// recognizer2022 detects one ISO-2022 style encoding by looking for the
// escape sequences that encoding uses.
type recognizer2022 struct {
	// charset is the name reported in the Charset field of Match's result.
	charset string
	// escapes holds the byte sequences that matchConfidence looks for
	// immediately after an ESC (0x1B) byte; the ESC itself is not stored.
	escapes [][]byte
}
|
||
|
|
||
|
func (r *recognizer2022) Match(input *recognizerInput) (output recognizerOutput) {
|
||
|
return recognizerOutput{
|
||
|
Charset: r.charset,
|
||
|
Confidence: r.matchConfidence(input.input),
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func (r *recognizer2022) matchConfidence(input []byte) int {
|
||
|
var hits, misses, shifts int
|
||
|
input:
|
||
|
for i := 0; i < len(input); i++ {
|
||
|
c := input[i]
|
||
|
if c == 0x1B {
|
||
|
for _, esc := range r.escapes {
|
||
|
if bytes.HasPrefix(input[i+1:], esc) {
|
||
|
hits++
|
||
|
i += len(esc)
|
||
|
continue input
|
||
|
}
|
||
|
}
|
||
|
misses++
|
||
|
} else if c == 0x0E || c == 0x0F {
|
||
|
shifts++
|
||
|
}
|
||
|
}
|
||
|
if hits == 0 {
|
||
|
return 0
|
||
|
}
|
||
|
quality := (100*hits - 100*misses) / (hits + misses)
|
||
|
if hits+shifts < 5 {
|
||
|
quality -= (5 - (hits + shifts)) * 10
|
||
|
}
|
||
|
if quality < 0 {
|
||
|
quality = 0
|
||
|
}
|
||
|
return quality
|
||
|
}
|
||
|
|
||
|
// escapeSequences_2022JP lists the escape sequences used by the ISO-2022-JP
// recognizer. Each entry is the part that follows the ESC (0x1B) byte; the
// ESC itself is checked separately by matchConfidence.
var escapeSequences_2022JP = [][]byte{
	{'$', '(', 'C'}, // KS X 1001:1992
	{'$', '(', 'D'}, // JIS X 212-1990
	{'$', '@'},      // JIS C 6226-1978
	{'$', 'A'},      // GB 2312-80
	{'$', 'B'},      // JIS X 208-1983
	{'&', '@'},      // JIS X 208 1990, 1997
	{'(', 'B'},      // ASCII
	{'(', 'H'},      // JIS-Roman
	{'(', 'I'},      // Half-width katakana
	{'(', 'J'},      // JIS-Roman
	{'.', 'A'},      // ISO 8859-1
	{'.', 'F'},      // ISO 8859-7
}
|
||
|
|
||
|
// escapeSequences_2022KR lists the escape sequences used by the ISO-2022-KR
// recognizer. The single entry is the part that follows the ESC (0x1B) byte;
// the ESC itself is checked separately by matchConfidence.
var escapeSequences_2022KR = [][]byte{
	{'$', ')', 'C'},
}
|
||
|
|
||
|
// escapeSequences_2022CN lists the escape sequences used by the ISO-2022-CN
// recognizer. Each entry is the part that follows the ESC (0x1B) byte; the
// ESC itself is checked separately by matchConfidence.
var escapeSequences_2022CN = [][]byte{
	{'$', ')', 'A'}, // GB 2312-80
	{'$', ')', 'G'}, // CNS 11643-1992 Plane 1
	{'$', '*', 'H'}, // CNS 11643-1992 Plane 2
	{'$', ')', 'E'}, // ISO-IR-165
	{'$', '+', 'I'}, // CNS 11643-1992 Plane 3
	{'$', '+', 'J'}, // CNS 11643-1992 Plane 4
	{'$', '+', 'K'}, // CNS 11643-1992 Plane 5
	{'$', '+', 'L'}, // CNS 11643-1992 Plane 6
	{'$', '+', 'M'}, // CNS 11643-1992 Plane 7
	{'N'},           // SS2
	{'O'},           // SS3
}
|
||
|
|
||
|
func newRecognizer_2022JP() *recognizer2022 {
|
||
|
return &recognizer2022{
|
||
|
"ISO-2022-JP",
|
||
|
escapeSequences_2022JP,
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func newRecognizer_2022KR() *recognizer2022 {
|
||
|
return &recognizer2022{
|
||
|
"ISO-2022-KR",
|
||
|
escapeSequences_2022KR,
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func newRecognizer_2022CN() *recognizer2022 {
|
||
|
return &recognizer2022{
|
||
|
"ISO-2022-CN",
|
||
|
escapeSequences_2022CN,
|
||
|
}
|
||
|
}
|