youtubebeat/vendor/github.com/temoto/robotstxt/scanner.go

206 lines
3.7 KiB
Go

package robotstxt
import (
"bytes"
"fmt"
"go/token"
"io"
"os"
"unicode/utf8"
)
// byteScanner splits the bytes handed to Feed into string tokens, keeping a
// single rune of look-ahead in ch.
type byteScanner struct {
// ErrorCount is incremented every time the error method is called.
ErrorCount int
// Quiet, when true, keeps the error method from printing to os.Stderr.
Quiet bool
// buf is the input most recently passed to Feed.
buf []byte
// pos holds the source name given to newByteScanner together with the
// offset, line and column that Feed resets and nextChar advances.
pos token.Position
// lastChunk stores the `end` argument of the latest Feed call.
lastChunk bool
// ch is the look-ahead rune; -1 means no rune is available (initial state
// or end of buf).
ch rune
// keyTokenFound is set once Scan has consumed the first ':' separator on a
// line and is cleared when Scan reaches an end of line or a comment.
keyTokenFound bool
}
// WhitespaceChars lists the runes the scanner treats as in-line whitespace:
// space, horizontal tab and vertical tab. Line terminators are not included.
var WhitespaceChars = []rune(" \t\v")
// newByteScanner allocates a scanner with no input attached yet.
//
// srcname is stored as the file name of the scanner position, and quiet
// is copied to the Quiet field. The look-ahead rune starts at -1, the
// "nothing read" marker, until Feed supplies input.
func newByteScanner(srcname string, quiet bool) *byteScanner {
	scanner := new(byteScanner)
	scanner.Quiet = quiet
	scanner.ch = -1
	scanner.pos = token.Position{Filename: srcname}
	return scanner
}
// Feed replaces the scanner input with input and primes the look-ahead rune.
//
// The position is reset to offset 0, line 1, column 1, and end is recorded as
// the "last chunk" flag. The error from reading the first rune is returned,
// so an empty input yields the error nextChar reports at end of buffer. A
// leading byte order mark (U+FEFF) is skipped and the column is set back to 1;
// the result of the read that follows the mark is deliberately not returned.
func (s *byteScanner) Feed(input []byte, end bool) error {
	const byteOrderMark = '\uFEFF'

	s.buf, s.lastChunk = input, end
	s.pos.Offset, s.pos.Line, s.pos.Column = 0, 1, 1

	// Load the first rune into the look-ahead slot s.ch.
	err := s.nextChar()
	if err != nil {
		return err
	}
	if s.ch != byteOrderMark {
		return nil
	}

	// Step over the byte order mark; it must not count as a column.
	_ = s.nextChar()
	s.pos.Column = 1
	return nil
}
// GetPosition returns a copy of the scanner's current position record.
func (s *byteScanner) GetPosition() token.Position {
	current := s.pos
	return current
}
// Scan returns the next token from the current input.
//
// Leading in-line whitespace is skipped first. The result is one of:
//   - "", io.EOF when no input is left, or when a comment runs to the end
//     of the input;
//   - "\n" for a run of line terminators, or for a '#' comment together with
//     the line terminators that follow it;
//   - otherwise a word that ends at whitespace, a line terminator or the end
//     of the input.
//
// The first ':' met on a line while collecting a word ends that word and is
// consumed without becoming part of it. Later ':' characters on the same line
// are ordinary word characters, so an absolute URL is not cut after "http:".
func (s *byteScanner) Scan() (string, error) {
	// Offset > len, not >=, so the last character can still be scanned.
	if s.lastChunk && s.pos.Offset > len(s.buf) {
		return "", io.EOF
	}

	s.skipSpace()

	switch {
	case s.ch == -1:
		return "", io.EOF

	case s.isEol():
		// Collapse the whole run of line terminators into one token.
		s.keyTokenFound = false
		for s.ch != -1 && s.isEol() {
			s.nextChar()
		}
		return "\n", nil

	case s.ch == '#':
		// A comment stands in for the end of its line.
		s.keyTokenFound = false
		s.skipUntilEol()
		if s.ch == -1 {
			return "", io.EOF
		}
		return "\n", nil
	}

	// Anything else starts a word; its first rune is taken unconditionally.
	var word bytes.Buffer
	word.WriteRune(s.ch)
	for s.nextChar(); s.ch != -1 && !s.isSpace() && !s.isEol(); s.nextChar() {
		if s.ch == ':' && !s.keyTokenFound {
			// Key/value separator: consume it and end the key token.
			s.keyTokenFound = true
			s.nextChar()
			break
		}
		word.WriteRune(s.ch)
	}
	return word.String(), nil
}
// ScanAll calls Scan repeatedly and collects every non-empty token.
//
// Scanning stops at io.EOF, which is treated as normal completion and
// reported as a nil error. A token returned together with io.EOF is kept.
// Any other error stops the scan and is returned with the tokens gathered
// so far.
func (s *byteScanner) ScanAll() ([]string, error) {
	var tokens []string
	for {
		tok, err := s.Scan()
		if tok != "" {
			tokens = append(tokens, tok)
		}
		switch err {
		case nil:
			continue
		case io.EOF:
			return tokens, nil
		default:
			return tokens, err
		}
	}
}
// error records one scanning problem.
//
// ErrorCount is always incremented. Unless the scanner is Quiet, a line
// naming pos and msg is also written to os.Stderr.
func (s *byteScanner) error(pos token.Position, msg string) {
	s.ErrorCount++
	if s.Quiet {
		return
	}
	fmt.Fprintf(os.Stderr, "robotstxt from %s: %s\n", pos.String(), msg)
}
// isEol reports whether the look-ahead rune is a line feed or a carriage
// return.
func (s *byteScanner) isEol() bool {
	switch s.ch {
	case '\n', '\r':
		return true
	}
	return false
}
// isSpace reports whether the look-ahead rune is one of WhitespaceChars.
// The list is consulted on every call.
func (s *byteScanner) isSpace() bool {
	for i := range WhitespaceChars {
		if WhitespaceChars[i] == s.ch {
			return true
		}
	}
	return false
}
// skipSpace advances the look-ahead rune past in-line whitespace, stopping
// at the first rune that is not whitespace or at the end of the input.
func (s *byteScanner) skipSpace() {
	for {
		if s.ch == -1 || !s.isSpace() {
			return
		}
		s.nextChar()
	}
}
// skipUntilEol discards the rest of the current line and then every line
// terminator that follows it. The look-ahead rune is left on the first rune
// of the next non-empty line, or at -1 when the input ends first.
func (s *byteScanner) skipUntilEol() {
	// Drop everything up to the first line terminator.
	for ; s.ch != -1; s.nextChar() {
		if s.isEol() {
			break
		}
	}
	// Drop the run of line terminators itself.
	for ; s.ch != -1; s.nextChar() {
		if !s.isEol() {
			break
		}
	}
}
// nextChar decodes the next Unicode code point from buf into the look-ahead
// rune s.ch and advances the position past it.
//
// At the end of buf it sets s.ch to -1 and returns io.EOF; otherwise it
// returns nil. An invalid UTF-8 byte is reported through s.error with the
// position of that byte, consumed as a single byte, and stored as
// utf8.RuneError.
//
// The column advances exactly once per rune. After the call, pos.Offset and
// pos.Column describe the location just past s.ch. The line counter is bumped
// lazily, when the rune following a '\n' is read.
func (s *byteScanner) nextChar() error {
	if s.pos.Offset >= len(s.buf) {
		s.ch = -1
		return io.EOF
	}

	// The previous rune ended a line, so the rune read now sits at column 1
	// of the next line.
	if s.ch == '\n' {
		s.pos.Line++
		s.pos.Column = 1
	}

	// Fast path for ASCII; fall back to a full decode for multi-byte runes.
	r, w := rune(s.buf[s.pos.Offset]), 1
	if r >= utf8.RuneSelf {
		r, w = utf8.DecodeRune(s.buf[s.pos.Offset:])
		if r == utf8.RuneError && w == 1 {
			// s.pos still points at the offending byte here.
			s.error(s.pos, "illegal UTF-8 encoding")
		}
	}

	s.pos.Column++
	s.pos.Offset += w
	s.ch = r
	return nil
}