youtubebeat/vendor/github.com/elastic/beats/filebeat/input/log/input.go

// Licensed to Elasticsearch B.V. under one or more contributor
// license agreements. See the NOTICE file distributed with
// this work for additional information regarding copyright
// ownership. Elasticsearch B.V. licenses this file to you under
// the Apache License, Version 2.0 (the "License"); you may
// not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package log
import (
"errors"
"fmt"
"os"
"path/filepath"
"sort"
"strings"
"time"
"github.com/elastic/beats/filebeat/channel"
"github.com/elastic/beats/filebeat/harvester"
"github.com/elastic/beats/filebeat/input"
"github.com/elastic/beats/filebeat/input/file"
"github.com/elastic/beats/filebeat/util"
"github.com/elastic/beats/libbeat/common"
"github.com/elastic/beats/libbeat/common/atomic"
"github.com/elastic/beats/libbeat/logp"
"github.com/elastic/beats/libbeat/monitoring"
)
const (
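// recursiveGlobDepth limits how many directory levels a recursive glob pattern ("**" in a path)
// is expanded to when the configured paths are resolved.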
recursiveGlobDepth = 8
harvesterErrMsg = "Harvester could not be started on new file: %s, Err: %s"
)
var (
filesRenamed = monitoring.NewInt(nil, "filebeat.input.log.files.renamed")
filesTruncated = monitoring.NewInt(nil, "filebeat.input.log.files.truncated")
harvesterSkipped = monitoring.NewInt(nil, "filebeat.harvester.skipped")
errHarvesterLimit = errors.New("harvester limit reached")
)
func init() {
err := input.Register("log", NewInput)
if err != nil {
panic(err)
}
}
// Input contains the input and its config
type Input struct {
cfg *common.Config
config config
states *file.States
harvesters *harvester.Registry
outlet channel.Outleter
stateOutlet channel.Outleter
done chan struct{}
numHarvesters atomic.Uint32
meta map[string]string
}
// NewInput instantiates a new Log
func NewInput(
cfg *common.Config,
outlet channel.Connector,
context input.Context,
) (input.Input, error) {
// Note: underlying output.
// The input and harvester have different requirements
// for when the outlets must be closed/unblocked.
// The outlet generated here is the underlying outlet, only closed
// once all workers have been shut down.
// For state updates and events, separate sub-outlets will be used.
out, err := outlet(cfg, context.DynamicFields)
if err != nil {
return nil, err
}
// stateOut will only be unblocked once the beat is shut down.
// Otherwise it can block on a full publisher pipeline, so state updates
// can still be forwarded correctly to the registrar.
stateOut := channel.CloseOnSignal(channel.SubOutlet(out), context.BeatDone)
meta := context.Meta
if len(meta) == 0 {
meta = nil
}
p := &Input{
config: defaultConfig,
cfg: cfg,
harvesters: harvester.NewRegistry(),
outlet: out,
stateOutlet: stateOut,
states: file.NewStates(),
done: context.Done,
meta: meta,
}
if err := cfg.Unpack(&p.config); err != nil {
return nil, err
}
if err := p.config.resolveRecursiveGlobs(); err != nil {
return nil, fmt.Errorf("Failed to resolve recursive globs in config: %v", err)
}
if err := p.config.normalizeGlobPatterns(); err != nil {
return nil, fmt.Errorf("Failed to normalize globs patterns: %v", err)
}
// Create empty harvester to check if configs are fine
// TODO: Do config validation instead
_, err = p.createHarvester(file.State{}, nil)
if err != nil {
return nil, err
}
if len(p.config.Paths) == 0 {
return nil, fmt.Errorf("each input must have at least one path defined")
}
err = p.loadStates(context.States)
if err != nil {
return nil, err
}
logp.Info("Configured paths: %v", p.config.Paths)
return p, nil
}
// loadStates loads states into the input
// It goes through all states coming from the registry. Only the states which match the glob patterns of
// the input will be loaded and updated. All other states will not be touched.
func (p *Input) loadStates(states []file.State) error {
logp.Debug("input", "exclude_files: %s. Number of stats: %d", p.config.ExcludeFiles, len(states))
for _, state := range states {
// Check if state source belongs to this input. If yes, update the state.
if p.matchesFile(state.Source) && p.matchesMeta(state.Meta) {
state.TTL = -1
// In case an input is started with an unfinished state matching the glob pattern
if !state.Finished {
return fmt.Errorf("Can only start an input when all related states are finished: %+v", state)
}
// Update input states and send new states to registry
err := p.updateState(state)
if err != nil {
logp.Err("Problem putting initial state: %+v", err)
return err
}
}
}
logp.Debug("input", "input with previous states loaded: %v", p.states.Count())
return nil
}
// Run runs the input
func (p *Input) Run() {
logp.Debug("input", "Start next scan")
// TailFiles is like ignore_older = 1ns and only on startup
if p.config.TailFiles {
ignoreOlder := p.config.IgnoreOlder
// Overwrite ignore_older for the first scan
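// 1ns means every existing file falls under ignore_older on this first scan, so its
// state offset is set to the end of the file (see handleIgnoreOlder).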
p.config.IgnoreOlder = 1
defer func() {
// Reset ignore_older after first run
p.config.IgnoreOlder = ignoreOlder
// Disable tail_files after the first run
p.config.TailFiles = false
}()
}
p.scan()
// It is important that a first scan is run before cleanup to make sure all new states are read first
if p.config.CleanInactive > 0 || p.config.CleanRemoved {
beforeCount := p.states.Count()
cleanedStates, pendingClean := p.states.Cleanup()
logp.Debug("input", "input states cleaned up. Before: %d, After: %d, Pending: %d",
beforeCount, beforeCount-cleanedStates, pendingClean)
}
// Marking removed files to be cleaned up. Cleanup happens after next scan to make sure all states are updated first
if p.config.CleanRemoved {
for _, state := range p.states.GetStates() {
// os.Stat will return an error in case the file does not exist
stat, err := os.Stat(state.Source)
if err != nil {
if os.IsNotExist(err) {
p.removeState(state)
logp.Debug("input", "Remove state for file as file removed: %s", state.Source)
} else {
logp.Err("input state for %s was not removed: %s", state.Source, err)
}
} else {
// Check if existing source on disk and state are the same. Remove if not the case.
newState := file.NewState(stat, state.Source, p.config.Type, p.meta)
if !newState.FileStateOS.IsSame(state.FileStateOS) {
p.removeState(state)
logp.Debug("input", "Remove state for file as file removed or renamed: %s", state.Source)
}
}
}
}
}
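// removeState marks a state for removal by setting its TTL to 0 and forwarding the update
// to the registrar. States of files still being harvested are left untouched.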
func (p *Input) removeState(state file.State) {
// Only clean up files where state is Finished
if !state.Finished {
logp.Debug("input", "State for file not removed because harvester not finished: %s", state.Source)
return
}
state.TTL = 0
err := p.updateState(state)
if err != nil {
logp.Err("File cleanup state update error: %s", err)
}
}
// getFiles returns all files which have to be harvested
// All globs are expanded and then directories and excluded files are removed
func (p *Input) getFiles() map[string]os.FileInfo {
paths := map[string]os.FileInfo{}
for _, path := range p.config.Paths {
matches, err := filepath.Glob(path)
if err != nil {
logp.Err("glob(%s) failed: %v", path, err)
continue
}
OUTER:
// Check any matched files to see if we need to start a harvester
for _, file := range matches {
// check if the file is in the exclude_files list
if p.isFileExcluded(file) {
logp.Debug("input", "Exclude file: %s", file)
continue
}
// Fetch Lstat file info to also detect symlinks
fileInfo, err := os.Lstat(file)
if err != nil {
logp.Debug("input", "lstat(%s) failed: %s", file, err)
continue
}
if fileInfo.IsDir() {
logp.Debug("input", "Skipping directory: %s", file)
continue
}
isSymlink := fileInfo.Mode()&os.ModeSymlink > 0
if isSymlink && !p.config.Symlinks {
logp.Debug("input", "File %s skipped as it is a symlink.", file)
continue
}
// Fetch Stat file info which fetches the inode. In case of a symlink, the original inode is fetched
fileInfo, err = os.Stat(file)
if err != nil {
logp.Debug("input", "stat(%s) failed: %s", file, err)
continue
}
// If symlinks are enabled, check that the original file is not part of the same input.
// If the original is harvested by another input, the states would potentially overwrite each other.
if p.config.Symlinks {
for _, finfo := range paths {
if os.SameFile(finfo, fileInfo) {
logp.Info("Same file found as symlink and originap. Skipping file: %s", file)
continue OUTER
}
}
}
paths[file] = fileInfo
}
}
return paths
}
// matchesFile returns true if the given filePath is part of this input, i.e. it matches one of its glob patterns
func (p *Input) matchesFile(filePath string) bool {
// Path is cleaned to ensure we always compare clean paths
filePath = filepath.Clean(filePath)
for _, glob := range p.config.Paths {
// Glob is cleaned to ensure we always compare clean paths
glob = filepath.Clean(glob)
// Evaluate if glob matches filePath
match, err := filepath.Match(glob, filePath)
if err != nil {
logp.Debug("input", "Error matching glob: %s", err)
continue
}
// Check if file is not excluded
if match && !p.isFileExcluded(filePath) {
return true
}
}
return false
}
// matchesMeta returns true if the given meta is equal to this input's meta, false otherwise
func (p *Input) matchesMeta(meta map[string]string) bool {
if len(meta) != len(p.meta) {
return false
}
for k, v := range p.meta {
if meta[k] != v {
return false
}
}
return true
}
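// FileSortInfo pairs a path with its os.FileInfo so scan candidates can be sorted.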
type FileSortInfo struct {
info os.FileInfo
path string
}
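// getSortInfos converts the path-to-FileInfo map into a slice of FileSortInfo for sorting.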
func getSortInfos(paths map[string]os.FileInfo) []FileSortInfo {
sortInfos := make([]FileSortInfo, 0, len(paths))
for path, info := range paths {
sortInfo := FileSortInfo{info: info, path: path}
sortInfos = append(sortInfos, sortInfo)
}
return sortInfos
}
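// getSortedFiles sorts the scan candidates by modification time or file name, ascending or
// descending, according to the scan.sort and scan.order settings.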
func getSortedFiles(scanOrder string, scanSort string, sortInfos []FileSortInfo) ([]FileSortInfo, error) {
var sortFunc func(i, j int) bool
switch scanSort {
case "modtime":
switch scanOrder {
case "asc":
sortFunc = func(i, j int) bool {
return sortInfos[i].info.ModTime().Before(sortInfos[j].info.ModTime())
}
case "desc":
sortFunc = func(i, j int) bool {
return sortInfos[i].info.ModTime().After(sortInfos[j].info.ModTime())
}
default:
return nil, fmt.Errorf("Unexpected value for scan.order: %v", scanOrder)
}
case "filename":
switch scanOrder {
case "asc":
sortFunc = func(i, j int) bool {
return strings.Compare(sortInfos[i].info.Name(), sortInfos[j].info.Name()) < 0
}
case "desc":
sortFunc = func(i, j int) bool {
return strings.Compare(sortInfos[i].info.Name(), sortInfos[j].info.Name()) > 0
}
default:
return nil, fmt.Errorf("Unexpected value for scan.order: %v", scanOrder)
}
default:
return nil, fmt.Errorf("Unexpected value for scan.sort: %v", scanSort)
}
if sortFunc != nil {
sort.Slice(sortInfos, sortFunc)
}
return sortInfos, nil
}
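// getFileState builds a new file.State for the given path, resolving it to an absolute path first.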
func getFileState(path string, info os.FileInfo, p *Input) (file.State, error) {
var err error
var absolutePath string
absolutePath, err = filepath.Abs(path)
if err != nil {
return file.State{}, fmt.Errorf("could not fetch abs path for file %s: %s", absolutePath, err)
}
logp.Debug("input", "Check file for harvesting: %s", absolutePath)
// Create new state for comparison
newState := file.NewState(info, absolutePath, p.config.Type, p.meta)
return newState, nil
}
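// getKeys returns the paths of the scan candidates in map iteration order; used when no scan.sort is configured.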
func getKeys(paths map[string]os.FileInfo) []string {
files := make([]string, 0)
for file := range paths {
files = append(files, file)
}
return files
}
// scan scans all configured paths/globs and starts harvesters for files which need one
func (p *Input) scan() {
var sortInfos []FileSortInfo
var files []string
paths := p.getFiles()
var err error
if p.config.ScanSort != "" {
sortInfos, err = getSortedFiles(p.config.ScanOrder, p.config.ScanSort, getSortInfos(paths))
if err != nil {
logp.Err("Failed to sort files during scan due to error %s", err)
}
}
if sortInfos == nil {
files = getKeys(paths)
}
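// Walk the candidates either in sorted order or, if no sorting was requested, in map order.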
for i := 0; i < len(paths); i++ {
var path string
var info os.FileInfo
if sortInfos == nil {
path = files[i]
info = paths[path]
} else {
path = sortInfos[i].path
info = sortInfos[i].info
}
select {
case <-p.done:
logp.Info("Scan aborted because input stopped.")
return
default:
}
newState, err := getFileState(path, info, p)
if err != nil {
logp.Err("Skipping file %s due to error %s", path, err)
continue
}
// Load last state
lastState := p.states.FindPrevious(newState)
// Ignores all files which fall under ignore_older
if p.isIgnoreOlder(newState) {
err := p.handleIgnoreOlder(lastState, newState)
if err != nil {
logp.Err("Updating ignore_older state error: %s", err)
}
continue
}
// Decides if previous state exists
if lastState.IsEmpty() {
logp.Debug("input", "Start harvester for new file: %s", newState.Source)
err := p.startHarvester(newState, 0)
if err == errHarvesterLimit {
logp.Debug("input", harvesterErrMsg, newState.Source, err)
continue
}
if err != nil {
logp.Err(harvesterErrMsg, newState.Source, err)
}
} else {
p.harvestExistingFile(newState, lastState)
}
}
}
// harvestExistingFile continues harvesting a file with a known state if needed
func (p *Input) harvestExistingFile(newState file.State, oldState file.State) {
logp.Debug("input", "Update existing file for harvesting: %s, offset: %v", newState.Source, oldState.Offset)
// No harvester is running for the file, start a new harvester
// It is important here that only the size is checked and not modification time, as modification time could be incorrect on windows
// https://blogs.technet.microsoft.com/asiasupp/2010/12/14/file-date-modified-property-are-not-updating-while-modifying-a-file-without-closing-it/
if oldState.Finished && newState.Fileinfo.Size() > oldState.Offset {
// Resume harvesting of an old file we've stopped harvesting from
// With force_close_older this could also cause a new harvester to be started after each scan even when it is not needed
// One problem with comparing modTime is that it is in seconds, and scans can happen more than once a second
logp.Debug("input", "Resuming harvesting of file: %s, offset: %d, new size: %d", newState.Source, oldState.Offset, newState.Fileinfo.Size())
err := p.startHarvester(newState, oldState.Offset)
if err != nil {
logp.Err("Harvester could not be started on existing file: %s, Err: %s", newState.Source, err)
}
return
}
// File size was reduced -> truncated file
if oldState.Finished && newState.Fileinfo.Size() < oldState.Offset {
logp.Debug("input", "Old file was truncated. Starting from the beginning: %s, offset: %d, new size: %d ", newState.Source, newState.Fileinfo.Size())
err := p.startHarvester(newState, 0)
if err != nil {
logp.Err("Harvester could not be started on truncated file: %s, Err: %s", newState.Source, err)
}
filesTruncated.Add(1)
return
}
// Check if file was renamed
if oldState.Source != "" && oldState.Source != newState.Source {
// This does not start a new harvester as it is assumed that the old harvester is still running
// or no new lines were detected. It only sends an event status update to make sure the new name is persisted.
logp.Debug("input", "File rename was detected: %s -> %s, Current offset: %v", oldState.Source, newState.Source, oldState.Offset)
if oldState.Finished {
logp.Debug("input", "Updating state for renamed file: %s -> %s, Current offset: %v", oldState.Source, newState.Source, oldState.Offset)
// Update state because of file rotation
oldState.Source = newState.Source
err := p.updateState(oldState)
if err != nil {
logp.Err("File rotation state update error: %s", err)
}
filesRenamed.Add(1)
} else {
logp.Debug("input", "File rename detected but harvester not finished yet.")
}
}
if !oldState.Finished {
// Nothing to do. Harvester is still running and file was not renamed
logp.Debug("input", "Harvester for file is still running: %s", newState.Source)
} else {
logp.Debug("input", "File didn't change: %s", newState.Source)
}
}
// handleIgnoreOlder handles states which fall under ignore_older
// Based on the state information it decides whether the state has to be updated
func (p *Input) handleIgnoreOlder(lastState, newState file.State) error {
logp.Debug("input", "Ignore file because ignore_older reached: %s", newState.Source)
if !lastState.IsEmpty() {
if !lastState.Finished {
logp.Info("File is falling under ignore_older before harvesting is finished. Adjust your close_* settings: %s", newState.Source)
}
// Old state exists, no need to update it
return nil
}
// Make sure file is not falling under clean_inactive yet
if p.isCleanInactive(newState) {
logp.Debug("input", "Do not write state for ignore_older because clean_inactive reached")
return nil
}
// Set offset to end of file to be consistent with files which were harvested before
// See https://github.com/elastic/beats/pull/2907
newState.Offset = newState.Fileinfo.Size()
// Write state for ignore_older file as none exists yet
newState.Finished = true
err := p.updateState(newState)
if err != nil {
return err
}
return nil
}
// isFileExcluded checks if the given path should be excluded
func (p *Input) isFileExcluded(file string) bool {
patterns := p.config.ExcludeFiles
return len(patterns) > 0 && harvester.MatchAny(patterns, file)
}
// isIgnoreOlder checks if the given state reached ignore_older
func (p *Input) isIgnoreOlder(state file.State) bool {
// ignore_older is disabled
if p.config.IgnoreOlder == 0 {
return false
}
modTime := state.Fileinfo.ModTime()
if time.Since(modTime) > p.config.IgnoreOlder {
return true
}
return false
}
// isCleanInactive checks if the given state falls under clean_inactive
func (p *Input) isCleanInactive(state file.State) bool {
// clean_inactive is disabled
if p.config.CleanInactive <= 0 {
return false
}
modTime := state.Fileinfo.ModTime()
if time.Since(modTime) > p.config.CleanInactive {
return true
}
return false
}
// subOutletWrap returns a factory method that will wrap the passed outlet
// in a SubOutlet and memoize the result so the wrapping is done only once.
func subOutletWrap(outlet channel.Outleter) func() channel.Outleter {
var subOutlet channel.Outleter
return func() channel.Outleter {
if subOutlet == nil {
subOutlet = channel.SubOutlet(outlet)
}
return subOutlet
}
}
// createHarvester creates a new harvester instance from the given state
func (p *Input) createHarvester(state file.State, onTerminate func()) (*Harvester, error) {
// Each harvester gets its own wrapped outlet so it can be closed individually
h, err := NewHarvester(
p.cfg,
state,
p.states,
func(d *util.Data) bool {
return p.stateOutlet.OnEvent(d)
},
subOutletWrap(p.outlet),
)
if err == nil {
h.onTerminate = onTerminate
}
return h, err
}
// startHarvester starts a new harvester with the given offset
// In case the HarvesterLimit is reached, an error is returned
func (p *Input) startHarvester(state file.State, offset int64) error {
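// Reserve a harvester slot up front; if the limit is exceeded or setup fails, the reservation is released again via Dec().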
if p.numHarvesters.Inc() > p.config.HarvesterLimit && p.config.HarvesterLimit > 0 {
p.numHarvesters.Dec()
harvesterSkipped.Add(1)
return errHarvesterLimit
}
// Set state to "not" finished to indicate that a harvester is running
state.Finished = false
state.Offset = offset
// Create harvester with state
h, err := p.createHarvester(state, func() { p.numHarvesters.Dec() })
if err != nil {
p.numHarvesters.Dec()
return err
}
err = h.Setup()
if err != nil {
p.numHarvesters.Dec()
return fmt.Errorf("error setting up harvester: %s", err)
}
// Update state before starting harvester
// This makes sure the states is set to Finished: false
// This is synchronous state update as part of the scan
h.SendStateUpdate()
if err = p.harvesters.Start(h); err != nil {
p.numHarvesters.Dec()
}
return err
}
// updateState updates the input state and forwards the event to the spooler
// All state updates done by the input itself are synchronous to make sure no states are overwritten
func (p *Input) updateState(state file.State) error {
// Add TTL if clean_inactive is enabled and TTL is not already 0
if p.config.CleanInactive > 0 && state.TTL != 0 {
state.TTL = p.config.CleanInactive
}
if len(state.Meta) == 0 {
state.Meta = nil
}
// Update the internal state first
p.states.Update(state)
data := util.NewData()
data.SetState(state)
ok := p.outlet.OnEvent(data)
if !ok {
logp.Info("input outlet closed")
return errors.New("input outlet closed")
}
return nil
}
// Wait waits for all harvesters to complete and only then calls Stop
func (p *Input) Wait() {
p.harvesters.WaitForCompletion()
p.Stop()
}
// Stop stops all harvesters and then stops the input
func (p *Input) Stop() {
// Stop all harvesters
// In case the beatDone channel is closed, this will not wait for completion
// Otherwise Stop will wait until output is complete
p.harvesters.Stop()
// close state updater
p.stateOutlet.Close()
// stop all communication between harvesters and publisher pipeline
p.outlet.Close()
}