195 lines
5.2 KiB
Go
195 lines
5.2 KiB
Go
|
// Licensed to Elasticsearch B.V. under one or more contributor
|
||
|
// license agreements. See the NOTICE file distributed with
|
||
|
// this work for additional information regarding copyright
|
||
|
// ownership. Elasticsearch B.V. licenses this file to you under
|
||
|
// the Apache License, Version 2.0 (the "License"); you may
|
||
|
// not use this file except in compliance with the License.
|
||
|
// You may obtain a copy of the License at
|
||
|
//
|
||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||
|
//
|
||
|
// Unless required by applicable law or agreed to in writing,
|
||
|
// software distributed under the License is distributed on an
|
||
|
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||
|
// KIND, either express or implied. See the License for the
|
||
|
// specific language governing permissions and limitations
|
||
|
// under the License.
|
||
|
|
||
|
package readfile
|
||
|
|
||
|
import (
|
||
|
"io"
|
||
|
|
||
|
"golang.org/x/text/encoding"
|
||
|
"golang.org/x/text/transform"
|
||
|
|
||
|
"github.com/elastic/beats/libbeat/common/streambuf"
|
||
|
"github.com/elastic/beats/libbeat/logp"
|
||
|
)
|
||
|
|
||
|
// lineReader reads lines from underlying reader, decoding the input stream
|
||
|
// using the configured codec. The reader keeps track of bytes consumed
|
||
|
// from raw input stream for every decoded line.
|
||
|
type LineReader struct {
|
||
|
reader io.Reader
|
||
|
codec encoding.Encoding
|
||
|
bufferSize int
|
||
|
nl []byte
|
||
|
inBuffer *streambuf.Buffer
|
||
|
outBuffer *streambuf.Buffer
|
||
|
inOffset int // input buffer read offset
|
||
|
byteCount int // number of bytes decoded from input buffer into output buffer
|
||
|
decoder transform.Transformer
|
||
|
}
|
||
|
|
||
|
// New creates a new reader object
|
||
|
func NewLineReader(input io.Reader, codec encoding.Encoding, bufferSize int) (*LineReader, error) {
|
||
|
encoder := codec.NewEncoder()
|
||
|
|
||
|
// Create newline char based on encoding
|
||
|
nl, _, err := transform.Bytes(encoder, []byte{'\n'})
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
|
||
|
return &LineReader{
|
||
|
reader: input,
|
||
|
codec: codec,
|
||
|
bufferSize: bufferSize,
|
||
|
nl: nl,
|
||
|
decoder: codec.NewDecoder(),
|
||
|
inBuffer: streambuf.New(nil),
|
||
|
outBuffer: streambuf.New(nil),
|
||
|
}, nil
|
||
|
}
|
||
|
|
||
|
// Next reads the next line until the new line character
|
||
|
func (r *LineReader) Next() ([]byte, int, error) {
|
||
|
// This loop is need in case advance detects an line ending which turns out
|
||
|
// not to be one when decoded. If that is the case, reading continues.
|
||
|
for {
|
||
|
// read next 'potential' line from input buffer/reader
|
||
|
err := r.advance()
|
||
|
if err != nil {
|
||
|
return nil, 0, err
|
||
|
}
|
||
|
|
||
|
// Check last decoded byte really being '\n' also unencoded
|
||
|
// if not, continue reading
|
||
|
buf := r.outBuffer.Bytes()
|
||
|
|
||
|
// This can happen if something goes wrong during decoding
|
||
|
if len(buf) == 0 {
|
||
|
logp.Err("Empty buffer returned by advance")
|
||
|
continue
|
||
|
}
|
||
|
|
||
|
if buf[len(buf)-1] == '\n' {
|
||
|
break
|
||
|
} else {
|
||
|
logp.Debug("line", "Line ending char found which wasn't one: %s", buf[len(buf)-1])
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// output buffer contains complete line ending with '\n'. Extract
|
||
|
// byte slice from buffer and reset output buffer.
|
||
|
bytes, err := r.outBuffer.Collect(r.outBuffer.Len())
|
||
|
r.outBuffer.Reset()
|
||
|
if err != nil {
|
||
|
// This should never happen as otherwise we have a broken state
|
||
|
panic(err)
|
||
|
}
|
||
|
|
||
|
// return and reset consumed bytes count
|
||
|
sz := r.byteCount
|
||
|
r.byteCount = 0
|
||
|
return bytes, sz, nil
|
||
|
}
|
||
|
|
||
|
// Reads from the buffer until a new line character is detected
|
||
|
// Returns an error otherwise
|
||
|
func (r *LineReader) advance() error {
|
||
|
// Initial check if buffer has already a newLine character
|
||
|
idx := r.inBuffer.IndexFrom(r.inOffset, r.nl)
|
||
|
|
||
|
// fill inBuffer until '\n' sequence has been found in input buffer
|
||
|
for idx == -1 {
|
||
|
// increase search offset to reduce iterations on buffer when looping
|
||
|
newOffset := r.inBuffer.Len() - len(r.nl)
|
||
|
if newOffset > r.inOffset {
|
||
|
r.inOffset = newOffset
|
||
|
}
|
||
|
|
||
|
buf := make([]byte, r.bufferSize)
|
||
|
|
||
|
// try to read more bytes into buffer
|
||
|
n, err := r.reader.Read(buf)
|
||
|
|
||
|
// Appends buffer also in case of err
|
||
|
r.inBuffer.Append(buf[:n])
|
||
|
if err != nil {
|
||
|
return err
|
||
|
}
|
||
|
|
||
|
// empty read => return buffer error (more bytes required error)
|
||
|
if n == 0 {
|
||
|
return streambuf.ErrNoMoreBytes
|
||
|
}
|
||
|
|
||
|
// Check if buffer has newLine character
|
||
|
idx = r.inBuffer.IndexFrom(r.inOffset, r.nl)
|
||
|
}
|
||
|
|
||
|
// found encoded byte sequence for '\n' in buffer
|
||
|
// -> decode input sequence into outBuffer
|
||
|
sz, err := r.decode(idx + len(r.nl))
|
||
|
if err != nil {
|
||
|
logp.Err("Error decoding line: %s", err)
|
||
|
// In case of error increase size by unencoded length
|
||
|
sz = idx + len(r.nl)
|
||
|
}
|
||
|
|
||
|
// consume transformed bytes from input buffer
|
||
|
err = r.inBuffer.Advance(sz)
|
||
|
r.inBuffer.Reset()
|
||
|
|
||
|
// continue scanning input buffer from last position + 1
|
||
|
r.inOffset = idx + 1 - sz
|
||
|
if r.inOffset < 0 {
|
||
|
// fix inOffset if '\n' has encoding > 8bits + firl line has been decoded
|
||
|
r.inOffset = 0
|
||
|
}
|
||
|
|
||
|
return err
|
||
|
}
|
||
|
|
||
|
func (r *LineReader) decode(end int) (int, error) {
|
||
|
var err error
|
||
|
buffer := make([]byte, 1024)
|
||
|
inBytes := r.inBuffer.Bytes()
|
||
|
start := 0
|
||
|
|
||
|
for start < end {
|
||
|
var nDst, nSrc int
|
||
|
|
||
|
nDst, nSrc, err = r.decoder.Transform(buffer, inBytes[start:end], false)
|
||
|
if err != nil {
|
||
|
// Check if error is different from destination buffer too short
|
||
|
if err != transform.ErrShortDst {
|
||
|
r.outBuffer.Write(inBytes[0:end])
|
||
|
start = end
|
||
|
break
|
||
|
}
|
||
|
|
||
|
// Reset error as decoding continues
|
||
|
err = nil
|
||
|
}
|
||
|
|
||
|
start += nSrc
|
||
|
r.outBuffer.Write(buffer[:nDst])
|
||
|
}
|
||
|
|
||
|
r.byteCount += start
|
||
|
return start, err
|
||
|
}
|