274 lines
6 KiB
Go
274 lines
6 KiB
Go
// Licensed to Elasticsearch B.V. under one or more contributor
|
|
// license agreements. See the NOTICE file distributed with
|
|
// this work for additional information regarding copyright
|
|
// ownership. Elasticsearch B.V. licenses this file to you under
|
|
// the Apache License, Version 2.0 (the "License"); you may
|
|
// not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing,
|
|
// software distributed under the License is distributed on an
|
|
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
// KIND, either express or implied. See the License for the
|
|
// specific language governing permissions and limitations
|
|
// under the License.
|
|
|
|
package match
|
|
|
|
import "regexp/syntax"
|
|
|
|
type trans func(*syntax.Regexp) (bool, *syntax.Regexp)
|
|
|
|
var transformations = []trans{
|
|
simplify,
|
|
uncapture,
|
|
trimLeft,
|
|
trimRight,
|
|
unconcat,
|
|
concatRepetition,
|
|
flattenRepetition,
|
|
}
|
|
|
|
// optimize runs minimal regular expression optimizations
|
|
// until fix-point.
|
|
func optimize(r *syntax.Regexp) *syntax.Regexp {
|
|
for {
|
|
changed := false
|
|
for _, t := range transformations {
|
|
var upd bool
|
|
upd, r = t(r)
|
|
changed = changed || upd
|
|
}
|
|
|
|
if changed == false {
|
|
return r
|
|
}
|
|
}
|
|
}
|
|
|
|
// Simplify regular expression by stdlib.
|
|
func simplify(r *syntax.Regexp) (bool, *syntax.Regexp) {
|
|
return false, r.Simplify()
|
|
}
|
|
|
|
// uncapture optimizes regular expression by removing capture groups from
|
|
// regular expression potentially allocating memory when executed.
|
|
func uncapture(r *syntax.Regexp) (bool, *syntax.Regexp) {
|
|
if r.Op == syntax.OpCapture {
|
|
// try to uncapture
|
|
if len(r.Sub) == 1 {
|
|
_, sub := uncapture(r.Sub[0])
|
|
return true, sub
|
|
}
|
|
|
|
tmp := *r
|
|
tmp.Op = syntax.OpConcat
|
|
r = &tmp
|
|
}
|
|
|
|
sub := make([]*syntax.Regexp, len(r.Sub))
|
|
modified := false
|
|
for i := range r.Sub {
|
|
var m bool
|
|
m, sub[i] = uncapture(r.Sub[i])
|
|
modified = modified || m
|
|
}
|
|
|
|
if !modified {
|
|
return false, r
|
|
}
|
|
|
|
tmp := *r
|
|
tmp.Sub = sub
|
|
return true, &tmp
|
|
}
|
|
|
|
// trimLeft removes not required '.*' from beginning of regular expressions.
|
|
func trimLeft(r *syntax.Regexp) (bool, *syntax.Regexp) {
|
|
if eqPrefixAnyRegex(r, patDotStar, patNullBeginDotStar) {
|
|
tmp := *r
|
|
tmp.Sub = tmp.Sub[1:]
|
|
return true, &tmp
|
|
}
|
|
|
|
return false, r
|
|
}
|
|
|
|
// trimRight removes not required '.*' from end of regular expressions.
|
|
func trimRight(r *syntax.Regexp) (bool, *syntax.Regexp) {
|
|
if eqSuffixAnyRegex(r, patDotStar, patNullEndDotStar) {
|
|
i := len(r.Sub) - 1
|
|
tmp := *r
|
|
tmp.Sub = tmp.Sub[0:i]
|
|
return true, &tmp
|
|
}
|
|
|
|
return false, r
|
|
}
|
|
|
|
// unconcat removes intermediate regular expression concatenations generated by
|
|
// parser if concatenation contains only 1 element. Removal of object from
|
|
// parse-tree can enable other optimization to fire.
|
|
func unconcat(r *syntax.Regexp) (bool, *syntax.Regexp) {
|
|
switch {
|
|
case r.Op == syntax.OpConcat && len(r.Sub) <= 1:
|
|
if len(r.Sub) == 1 {
|
|
return true, r.Sub[0]
|
|
}
|
|
|
|
return true, &syntax.Regexp{
|
|
Op: syntax.OpEmptyMatch,
|
|
Flags: r.Flags,
|
|
}
|
|
|
|
case r.Op == syntax.OpRepeat && r.Min == r.Max && r.Min == 1:
|
|
return true, r.Sub[0]
|
|
}
|
|
|
|
return false, r
|
|
}
|
|
|
|
// concatRepetition concatenates 2 consecutive repeated sub-patterns into a
|
|
// repetition of length 2.
|
|
func concatRepetition(r *syntax.Regexp) (bool, *syntax.Regexp) {
|
|
if r.Op != syntax.OpConcat {
|
|
// don't iterate sub-expressions if top-level is no OpConcat
|
|
return false, r
|
|
}
|
|
|
|
// check if concatenated op is already a repetition
|
|
if isConcatRepetition(r) {
|
|
return false, r
|
|
}
|
|
|
|
// concatenate repetitions in sub-expressions first
|
|
var subs []*syntax.Regexp
|
|
changed := false
|
|
for _, sub := range r.Sub {
|
|
changedSub, tmp := concatRepetition(sub)
|
|
changed = changed || changedSub
|
|
subs = append(subs, tmp)
|
|
}
|
|
|
|
var concat []*syntax.Regexp
|
|
lastMerged := -1
|
|
for i, j := 0, 1; j < len(subs); i, j = j, j+1 {
|
|
if subs[i].Op == syntax.OpRepeat && eqRegex(subs[i].Sub[0], subs[j]) {
|
|
r := subs[i]
|
|
concat = append(concat,
|
|
&syntax.Regexp{
|
|
Op: syntax.OpRepeat,
|
|
Sub: r.Sub,
|
|
Min: r.Min + 1,
|
|
Max: r.Max + 1,
|
|
Flags: r.Flags,
|
|
},
|
|
)
|
|
|
|
lastMerged = j
|
|
changed = true
|
|
j++
|
|
continue
|
|
}
|
|
|
|
if isConcatRepetition(subs[i]) && eqRegex(subs[i].Sub[0], subs[j]) {
|
|
r := subs[i]
|
|
concat = append(concat,
|
|
&syntax.Regexp{
|
|
Op: syntax.OpConcat,
|
|
Sub: append(r.Sub, r.Sub[0]),
|
|
Flags: r.Flags,
|
|
},
|
|
)
|
|
|
|
lastMerged = j
|
|
changed = true
|
|
j++
|
|
continue
|
|
}
|
|
|
|
if eqRegex(subs[i], subs[j]) {
|
|
r := subs[i]
|
|
concat = append(concat,
|
|
&syntax.Regexp{
|
|
Op: syntax.OpRepeat,
|
|
Sub: []*syntax.Regexp{r},
|
|
Min: 2,
|
|
Max: 2,
|
|
Flags: r.Flags,
|
|
},
|
|
)
|
|
|
|
lastMerged = j
|
|
changed = true
|
|
j++
|
|
continue
|
|
}
|
|
|
|
concat = append(concat, subs[i])
|
|
}
|
|
|
|
if lastMerged+1 != len(subs) {
|
|
concat = append(concat, subs[len(subs)-1])
|
|
}
|
|
|
|
r = &syntax.Regexp{
|
|
Op: syntax.OpConcat,
|
|
Sub: concat,
|
|
Flags: r.Flags,
|
|
}
|
|
return changed, r
|
|
}
|
|
|
|
// flattenRepetition flattens nested repetitions
|
|
func flattenRepetition(r *syntax.Regexp) (bool, *syntax.Regexp) {
|
|
if r.Op != syntax.OpConcat {
|
|
// don't iterate sub-expressions if top-level is no OpConcat
|
|
return false, r
|
|
}
|
|
|
|
sub := r.Sub
|
|
inRepetition := false
|
|
if isConcatRepetition(r) {
|
|
sub = sub[:1]
|
|
inRepetition = true
|
|
|
|
// create flattened regex repetition multiplying count
|
|
// if nexted expression is also a repetition
|
|
if s := sub[0]; isConcatRepetition(s) {
|
|
count := len(s.Sub) * len(r.Sub)
|
|
return true, &syntax.Regexp{
|
|
Op: syntax.OpRepeat,
|
|
Sub: s.Sub[:1],
|
|
Min: count,
|
|
Max: count,
|
|
Flags: r.Flags | s.Flags,
|
|
}
|
|
}
|
|
}
|
|
|
|
// recursively check if we can flatten sub-expressions
|
|
changed := false
|
|
for i, s := range sub {
|
|
upd, tmp := flattenRepetition(s)
|
|
changed = changed || upd
|
|
sub[i] = tmp
|
|
}
|
|
|
|
if !changed {
|
|
return false, r
|
|
}
|
|
|
|
// fix up top-level repetition with modified one
|
|
tmp := *r
|
|
if inRepetition {
|
|
for i := range r.Sub {
|
|
tmp.Sub[i] = sub[0]
|
|
}
|
|
} else {
|
|
tmp.Sub = sub
|
|
}
|
|
return changed, &tmp
|
|
}
|