139 lines
3.8 KiB
Go
139 lines
3.8 KiB
Go
// Licensed to Elasticsearch B.V. under one or more contributor
|
|
// license agreements. See the NOTICE file distributed with
|
|
// this work for additional information regarding copyright
|
|
// ownership. Elasticsearch B.V. licenses this file to you under
|
|
// the Apache License, Version 2.0 (the "License"); you may
|
|
// not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing,
|
|
// software distributed under the License is distributed on an
|
|
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
// KIND, either express or implied. See the License for the
|
|
// specific language governing permissions and limitations
|
|
// under the License.
|
|
|
|
package sys
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"strings"
|
|
"unicode/utf16"
|
|
"unicode/utf8"
|
|
)
|
|
|
|
// The conditions replacementChar==unicode.ReplacementChar and
// maxRune==unicode.MaxRune are verified in the tests.
// Defining them locally avoids this package depending on package unicode.

const (
	replacementChar = '\uFFFD'     // Unicode replacement character (U+FFFD).
	maxRune         = '\U0010FFFF' // Maximum valid Unicode code point (U+10FFFF).
)
|
|
|
|
// Boundaries of the UTF-16 surrogate ranges. Both ranges are half-open.
const (
	// [0xd800, 0xdc00) encodes the high 10 bits of a surrogate pair.
	// [0xdc00, 0xe000) encodes the low 10 bits of a surrogate pair.
	// The decoded value is those 20 bits plus 0x10000.
	surr1 = 0xd800 // first high-surrogate code unit
	surr2 = 0xdc00 // first low-surrogate code unit; one past the last high surrogate
	surr3 = 0xe000 // one past the last low-surrogate code unit

	surrSelf = 0x10000 // offset added to the 20 bits carried by a surrogate pair
)
|
|
|
|
// ErrBufferTooSmall is a sentinel error whose message is "buffer too small".
// NOTE(review): nothing in the visible part of this file returns it;
// presumably callers elsewhere in the package use it to report an undersized
// buffer — confirm against the call sites.
var ErrBufferTooSmall = errors.New("buffer too small")
|
|
|
|
// UTF16ToUTF8Bytes decodes little-endian UTF-16 data from in and writes its
// UTF-8 encoding to out.
//
// Decoding stops at the first null code unit (0x0000) or at the end of in,
// whichever comes first. A surrogate code unit that is not part of a valid
// high/low pair is written as U+FFFD and only that single unit is consumed,
// so the code unit that follows an unpaired high surrogate (including a null
// terminator) is still processed normally.
//
// An error is returned if len(in) is odd or if a write to out fails. In the
// latter case the error from out.Write is returned unchanged; output written
// before the failure is not rolled back.
func UTF16ToUTF8Bytes(in []byte, out io.Writer) error {
	if len(in)%2 != 0 {
		return fmt.Errorf("input buffer must have an even length (length=%d)", len(in))
	}

	var runeBuf [utf8.UTFMax]byte
	for i := 0; i < len(in); i += 2 {
		r := rune(uint16(in[i]) | uint16(in[i+1])<<8)

		// Stop at null-terminator.
		if r == 0 {
			return nil
		}

		if utf16.IsSurrogate(r) {
			// Assume the surrogate is unpaired until a valid pair is proven.
			decoded := utf8.RuneError
			if i+3 < len(in) {
				next := rune(uint16(in[i+2]) | uint16(in[i+3])<<8)
				// DecodeRune yields U+FFFD unless r is a high surrogate and
				// next is a low surrogate; a valid pair always decodes to a
				// code point >= 0x10000, so the comparison is unambiguous.
				if pair := utf16.DecodeRune(r, next); pair != utf8.RuneError {
					decoded = pair
					// Consume the low surrogate only when it was really used.
					i += 2
				}
			}
			r = decoded
		}

		n := utf8.EncodeRune(runeBuf[:], r)
		if _, err := out.Write(runeBuf[:n]); err != nil {
			return err
		}
	}

	return nil
}
|
|
|
|
// UTF16BytesToString returns a string that is decoded from the UTF-16 bytes.
|
|
// The byte slice must be of even length otherwise an error will be returned.
|
|
// The integer returned is the offset to the start of the next string with
|
|
// buffer if it exists, otherwise -1 is returned.
|
|
func UTF16BytesToString(b []byte) (string, int, error) {
|
|
if len(b)%2 != 0 {
|
|
return "", 0, fmt.Errorf("Slice must have an even length (length=%d)", len(b))
|
|
}
|
|
|
|
offset := -1
|
|
|
|
// Find the null terminator if it exists and re-slice the b.
|
|
if nullIndex := indexNullTerminator(b); nullIndex > -1 {
|
|
if len(b) > nullIndex+2 {
|
|
offset = nullIndex + 2
|
|
}
|
|
|
|
b = b[:nullIndex]
|
|
}
|
|
|
|
s := make([]uint16, len(b)/2)
|
|
for i := range s {
|
|
s[i] = uint16(b[i*2]) + uint16(b[(i*2)+1])<<8
|
|
}
|
|
|
|
return string(utf16.Decode(s)), offset, nil
|
|
}
|
|
|
|
// indexNullTerminator returns the byte index of the first UTF-16 null
// terminator in b, i.e. the first 16-bit code unit whose two bytes are both
// zero. It returns -1 if no terminator is found.
//
// Only even offsets are examined, so a pair of zero bytes that straddles two
// code units is not treated as a terminator. A trailing odd byte, if any, is
// ignored rather than read past the end of the slice.
func indexNullTerminator(b []byte) int {
	// Bounding on i+1 guarantees both bytes of the code unit exist; it also
	// covers empty and single-byte input without a separate length check.
	for i := 0; i+1 < len(b); i += 2 {
		if b[i] == 0 && b[i+1] == 0 {
			return i
		}
	}

	return -1
}
|
|
|
|
// RemoveWindowsLineEndings converts every CRLF ("\r\n") sequence in s to a
// single LF ("\n") and then strips all LF characters from the end of the
// result. A carriage return that is not followed by a line feed is left
// untouched.
func RemoveWindowsLineEndings(s string) string {
	return strings.TrimRight(strings.ReplaceAll(s, "\r\n", "\n"), "\n")
}
|