youtubebeat/vendor/github.com/elastic/beats/libbeat/common/match/optimize.go

274 lines
6 KiB
Go

// Licensed to Elasticsearch B.V. under one or more contributor
// license agreements. See the NOTICE file distributed with
// this work for additional information regarding copyright
// ownership. Elasticsearch B.V. licenses this file to you under
// the Apache License, Version 2.0 (the "License"); you may
// not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package match
import "regexp/syntax"
// trans is the signature shared by all optimization passes. A pass receives a
// parsed regular expression and returns the resulting expression together
// with a flag; optimize keeps iterating as long as any pass returns true.
type trans func(*syntax.Regexp) (bool, *syntax.Regexp)
// transformations lists the passes that optimize applies. On every round the
// passes run in exactly this order, each one receiving the output of the
// previous one.
var transformations = []trans{
simplify,
uncapture,
trimLeft,
trimRight,
unconcat,
concatRepetition,
flattenRepetition,
}
// optimize applies every entry of transformations to r, feeding the output of
// one pass into the next, and repeats the whole sequence until a complete
// round finishes without any pass reporting a change (fix-point). The
// expression produced by that last round is returned.
func optimize(r *syntax.Regexp) *syntax.Regexp {
	for changed := true; changed; {
		changed = false
		for _, apply := range transformations {
			upd, next := apply(r)
			// The result is always carried forward, even when the pass
			// reports "unchanged".
			r = next
			if upd {
				changed = true
			}
		}
	}
	return r
}
// simplify hands the expression to the standard library's Regexp.Simplify.
// The returned flag is always false, so this pass on its own never requests
// another optimization round.
func simplify(r *syntax.Regexp) (bool, *syntax.Regexp) {
	simplified := r.Simplify()
	return false, simplified
}
// uncapture returns an equivalent expression without capture groups, as
// capture groups can cause memory allocations when the expression is
// executed.
//
// The input tree is never modified: nodes that need to change are shallow
// copied. The returned flag is true whenever the result differs from r, i.e.
// whenever a capture group was removed anywhere in the tree.
func uncapture(r *syntax.Regexp) (bool, *syntax.Regexp) {
	modified := false

	if r.Op == syntax.OpCapture {
		// Common case: a capture group wraps exactly one sub-expression,
		// which simply takes the place of the group.
		if len(r.Sub) == 1 {
			_, sub := uncapture(r.Sub[0])
			return true, sub
		}

		// Any other number of children: keep them, but turn the group into
		// a plain concatenation. This is a rewrite and must be reported as
		// such even if none of the children change.
		tmp := *r
		tmp.Op = syntax.OpConcat
		r = &tmp
		modified = true
	}

	// Strip capture groups from all children.
	sub := make([]*syntax.Regexp, len(r.Sub))
	for i, child := range r.Sub {
		m, stripped := uncapture(child)
		sub[i] = stripped
		modified = modified || m
	}

	if !modified {
		return false, r
	}

	tmp := *r
	tmp.Sub = sub
	return true, &tmp
}
// trimLeft strips a leading '.*' that is not required for matching. When
// eqPrefixAnyRegex reports that r starts with patDotStar or
// patNullBeginDotStar, a shallow copy of r without its first sub-expression
// is returned; otherwise r is handed back unchanged.
func trimLeft(r *syntax.Regexp) (bool, *syntax.Regexp) {
	if !eqPrefixAnyRegex(r, patDotStar, patNullBeginDotStar) {
		return false, r
	}

	trimmed := *r
	trimmed.Sub = r.Sub[1:]
	return true, &trimmed
}
// trimRight strips a trailing '.*' that is not required for matching. When
// eqSuffixAnyRegex reports that r ends with patDotStar or patNullEndDotStar,
// a shallow copy of r without its last sub-expression is returned; otherwise
// r is handed back unchanged.
func trimRight(r *syntax.Regexp) (bool, *syntax.Regexp) {
	if !eqSuffixAnyRegex(r, patDotStar, patNullEndDotStar) {
		return false, r
	}

	trimmed := *r
	trimmed.Sub = r.Sub[:len(r.Sub)-1]
	return true, &trimmed
}
// unconcat unwraps trivial top-level wrapper nodes so that other
// optimizations get a chance to match on the wrapped expression:
//
//   - a concatenation with exactly one element is replaced by that element,
//   - a concatenation without elements is replaced by an empty-match node
//     carrying the same flags,
//   - a repetition with Min == Max == 1 is replaced by its sub-expression.
//
// Any other expression is returned unchanged.
func unconcat(r *syntax.Regexp) (bool, *syntax.Regexp) {
	if r.Op == syntax.OpConcat {
		switch len(r.Sub) {
		case 0:
			empty := &syntax.Regexp{
				Op:    syntax.OpEmptyMatch,
				Flags: r.Flags,
			}
			return true, empty
		case 1:
			return true, r.Sub[0]
		}
		return false, r
	}

	if r.Op == syntax.OpRepeat && r.Min == 1 && r.Max == 1 {
		return true, r.Sub[0]
	}

	return false, r
}
// concatRepetition merges neighbouring, equal sub-expressions of a
// concatenation into counted repetitions.
//
// Only OpConcat nodes are processed; any other node, and any concatenation
// for which isConcatRepetition already returns true, is returned as is with
// the flag set to false. Otherwise the pass first recurses into every
// sub-expression and then walks the resulting list pairwise (current, next),
// trying in order:
//
//  1. current is an OpRepeat whose operand equals next:
//     the pair becomes a repeat with both bounds raised by one,
//  2. current is a concat-repetition whose first element equals next:
//     the pair becomes that concatenation extended by one more element,
//  3. current equals next: the pair becomes a repeat with Min == Max == 2.
//
// After a merge the walk continues behind the merged pair. A merged node is
// not compared with its new neighbour in the same call, so longer runs are
// only collapsed by calling the pass again. The returned flag is true if any
// merge happened, here or in a sub-expression.
//
// Equality is decided by eqRegex and "concat-repetition" by
// isConcatRepetition; both are defined elsewhere in this package.
func concatRepetition(r *syntax.Regexp) (bool, *syntax.Regexp) {
	if r.Op != syntax.OpConcat {
		// don't iterate sub-expressions if top-level is no OpConcat
		return false, r
	}

	// check if concatenated op is already a repetition
	if isConcatRepetition(r) {
		return false, r
	}

	// concatenate repetitions in sub-expressions first
	changed := false
	subs := make([]*syntax.Regexp, 0, len(r.Sub))
	for _, sub := range r.Sub {
		changedSub, tmp := concatRepetition(sub)
		changed = changed || changedSub
		subs = append(subs, tmp)
	}

	concat := make([]*syntax.Regexp, 0, len(subs))
	lastMerged := -1 // index of the last element consumed by a merge
	for i, j := 0, 1; j < len(subs); i, j = j, j+1 {
		cur, next := subs[i], subs[j]

		var merged *syntax.Regexp
		switch {
		case cur.Op == syntax.OpRepeat && eqRegex(cur.Sub[0], next):
			// x{min,max}x => x{min+1,max+1}. Max == -1 means "no upper
			// limit" and must stay -1; incrementing it would produce an
			// upper bound of 0.
			upper := cur.Max
			if upper >= 0 {
				upper++
			}
			merged = &syntax.Regexp{
				Op:    syntax.OpRepeat,
				Sub:   cur.Sub,
				Min:   cur.Min + 1,
				Max:   upper,
				Flags: cur.Flags,
			}

		case isConcatRepetition(cur) && eqRegex(cur.Sub[0], next):
			// Limit the capacity so append always copies: cur.Sub may have
			// spare capacity in a backing array that is shared with the
			// input tree and must not be written to.
			n := len(cur.Sub)
			merged = &syntax.Regexp{
				Op:    syntax.OpConcat,
				Sub:   append(cur.Sub[:n:n], cur.Sub[0]),
				Flags: cur.Flags,
			}

		case eqRegex(cur, next):
			merged = &syntax.Regexp{
				Op:    syntax.OpRepeat,
				Sub:   []*syntax.Regexp{cur},
				Min:   2,
				Max:   2,
				Flags: cur.Flags,
			}
		}

		if merged == nil {
			concat = append(concat, cur)
			continue
		}

		concat = append(concat, merged)
		lastMerged = j
		changed = true
		// next has been consumed by the merge: skip it, so the loop's post
		// statement resumes with the pair following it.
		j++
	}

	// The loop only emits the "current" element of each pair, so the final
	// element still has to be added unless a merge consumed it.
	if lastMerged+1 != len(subs) {
		concat = append(concat, subs[len(subs)-1])
	}

	return changed, &syntax.Regexp{
		Op:    syntax.OpConcat,
		Sub:   concat,
		Flags: r.Flags,
	}
}
// flattenRepetition flattens nested repetitions.
//
// Only OpConcat nodes are processed; everything else is returned as is.
//
// If r is a concat-repetition (as reported by isConcatRepetition) whose first
// element is itself a concat-repetition, both levels are replaced by a single
// OpRepeat of the innermost element with
// Min == Max == len(inner.Sub) * len(r.Sub).
//
// Otherwise the pass recurses into the sub-expressions. For a
// concat-repetition only the first element is visited, and its flattened form
// is then used for every position; for any other concatenation each element
// is visited.
//
// The input tree is left untouched: when something changed, a shallow copy of
// r with a freshly allocated Sub slice is returned together with true. When
// nothing changed, r itself is returned together with false.
func flattenRepetition(r *syntax.Regexp) (bool, *syntax.Regexp) {
	if r.Op != syntax.OpConcat {
		// don't iterate sub-expressions if top-level is no OpConcat
		return false, r
	}

	// Number of leading sub-expressions that need to be visited.
	visit := len(r.Sub)

	inRepetition := isConcatRepetition(r)
	if inRepetition {
		// In a repetition it is sufficient to look at the first element.
		visit = 1

		// create flattened regex repetition multiplying count
		// if nested expression is also a repetition
		if s := r.Sub[0]; isConcatRepetition(s) {
			count := len(s.Sub) * len(r.Sub)
			return true, &syntax.Regexp{
				Op: syntax.OpRepeat,
				// Capacity-limited view, so a later append on the result
				// cannot write into s.Sub's backing array.
				Sub:   s.Sub[:1:1],
				Min:   count,
				Max:   count,
				Flags: r.Flags | s.Flags,
			}
		}
	}

	// recursively check if we can flatten sub-expressions; results go into a
	// new slice so r.Sub is never written to
	flattened := make([]*syntax.Regexp, visit)
	changed := false
	for i, s := range r.Sub[:visit] {
		upd, tmp := flattenRepetition(s)
		changed = changed || upd
		flattened[i] = tmp
	}

	if !changed {
		return false, r
	}

	// fix up top-level repetition with modified one
	tmp := *r
	if inRepetition {
		// Every position repeats the same (now flattened) element.
		tmp.Sub = make([]*syntax.Regexp, len(r.Sub))
		for i := range tmp.Sub {
			tmp.Sub[i] = flattened[0]
		}
	} else {
		tmp.Sub = flattened
	}
	return true, &tmp
}