Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve case insensitive search to avoid allocations. #4394

Merged
merged 5 commits into from
Nov 23, 2021
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 94 additions & 5 deletions pkg/logql/log/filter.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,11 @@ import (
"fmt"
"regexp"
"regexp/syntax"
"unicode"
"unicode/utf8"

"github.com/prometheus/prometheus/pkg/labels"
"github.com/prometheus/prometheus/pkg/pool"
)

// Filterer is an interface to filter log lines.
Expand Down Expand Up @@ -161,16 +164,102 @@ func (r regexpFilter) ToStage() Stage {
}
}

var (
	// BytesBufferPool pools scratch byte slices used for the lower-cased
	// copies of lines, so case-insensitive filtering avoids an allocation
	// per line. Buckets [0.5KB,1KB,2KB,4KB,8KB] (min 1<<9, max 1<<13,
	// growth factor 2). Note the factory returns slices with len 0.
	BytesBufferPool = pool.New(1<<9, 1<<13, 2, func(size int) interface{} { return make([]byte, 0, size) })

	// toLower maps a single rune to its Unicode lower-case form.
	toLower = func(r rune) rune { return unicode.To(unicode.LowerCase, r) }
)

// containsFilter matches lines containing the byte sequence `match`.
// When caseInsensitive is set, `match` is stored lower-cased (see
// newContainsFilter) and each line is lower-cased into a reusable
// buffer before comparison.
type containsFilter struct {
	match           []byte
	caseInsensitive bool

	// buf is a reusable scratch buffer for the lower-cased copy of a
	// line; it is obtained from and returned to BytesBufferPool.
	// NOTE(review): reusing it makes Filter stateful — confirm a single
	// filter instance is never shared across goroutines.
	buf []byte // reusable buffer for lowercase transformation
}

// Filter reports whether line contains the filter's match bytes.
// For case-insensitive filters it first scans the line to pick the
// cheapest path: no transformation for lower-case ASCII, an ASCII
// fast path when only A-Z needs folding, and a full Unicode fold
// otherwise.
func (l *containsFilter) Filter(line []byte) bool {
	if !l.caseInsensitive {
		return bytes.Contains(line, l.match)
	}
	// Single pass: detect any non-ASCII byte and any ASCII upper-case
	// letter. Stop at the first non-ASCII byte — Unicode folding is
	// required from there on regardless of case.
	asciiOnly, needsFold := true, false
	for i := 0; i < len(line); i++ {
		b := line[i]
		if b >= utf8.RuneSelf {
			asciiOnly = false
			break
		}
		if 'A' <= b && b <= 'Z' {
			needsFold = true
		}
	}
	switch {
	case asciiOnly && !needsFold:
		// Already lower-case ASCII; compare directly.
		return bytes.Contains(line, l.match)
	case asciiOnly:
		return bytes.Contains(l.toLowerASCII(line), l.match)
	default:
		return bytes.Contains(l.toLowerUnicode(line), l.match)
	}
}

func (l containsFilter) Filter(line []byte) bool {
if l.caseInsensitive {
line = bytes.ToLower(line)
func (l *containsFilter) toLowerASCII(line []byte) []byte {
if len(line) > cap(l.buf) {
if l.buf != nil {
BytesBufferPool.Put(l.buf)
}
l.buf = BytesBufferPool.Get(len(line)).([]byte)[:len(line)]
}
for i := 0; i < len(line); i++ {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the buffer is being reused, don't we need to set l.buf l.buf[:len(line)]? Otherwise, it looks like it could include the end of a previous line that was longer. Alternatively, we could also return l.buf[:len(line)] at the end.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we do line 210 I think

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but that's only run when len(line) > cap(l.buf) (line 206). If buf is already cap=6, len=6 and line is len=5, we'll end up returning the whole len=6 buf at the end, despite only writing the first 5 indices. That seems like a bug to me; is there something I'm missing?

c := line[i]
if 'A' <= c && c <= 'Z' {
c += 'a' - 'A'
}
l.buf[i] = c
}
return l.buf
}

// toLowerUnicode returns a lower-cased version of line, using the Unicode
// lower-case mapping. It is inspired by bytes.ToLower but writes into the
// filter's pooled buffer instead of allocating a new one per call.
func (l *containsFilter) toLowerUnicode(line []byte) []byte {
	// In the worst case the output can grow when mapped (a lower-case
	// rune may encode wider than the original), making things
	// unpleasant. But it's so rare we barge in assuming it's fine.
	// It could also shrink, but that falls out naturally.
	nbytes := 0 // number of bytes encoded into l.buf
	if len(line) > cap(l.buf) {
		if l.buf != nil {
			BytesBufferPool.Put(l.buf)
		}
		l.buf = BytesBufferPool.Get(len(line)).([]byte)
	}
	// Reslice to full capacity so nbytes-based indexing below is valid
	// regardless of the pooled slice's current length.
	l.buf = l.buf[:cap(l.buf)]
	for i := 0; i < len(line); {
		wid := 1
		r := rune(line[i])
		if r >= utf8.RuneSelf {
			r, wid = utf8.DecodeRune(line[i:])
		}
		r = toLower(r)
		if r >= 0 {
			rl := utf8.RuneLen(r)
			if rl < 0 {
				// Invalid rune: it will be encoded as utf8.RuneError.
				rl = len(string(utf8.RuneError))
			}
			if nbytes+rl > cap(l.buf) {
				// Grow: fetch a larger pooled buffer and carry over what
				// has been written so far. Pooled slices may have len 0,
				// so reslice to capacity BEFORE copying — otherwise copy
				// would transfer nothing and silently drop the prefix.
				nb := BytesBufferPool.Get(cap(l.buf)*2 + utf8.UTFMax).([]byte)
				nb = nb[:cap(nb)]
				copy(nb, l.buf[:nbytes])
				BytesBufferPool.Put(l.buf)
				l.buf = nb
			}
			nbytes += utf8.EncodeRune(l.buf[nbytes:cap(l.buf)], r)
		}
		i += wid
	}
	return l.buf[:nbytes]
}

func (l containsFilter) ToStage() Stage {
Expand All @@ -193,7 +282,7 @@ func newContainsFilter(match []byte, caseInsensitive bool) Filterer {
if caseInsensitive {
match = bytes.ToLower(match)
}
return containsFilter{
return &containsFilter{
match: match,
caseInsensitive: caseInsensitive,
}
Expand Down
9 changes: 9 additions & 0 deletions pkg/logql/log/filter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
func Test_SimplifiedRegex(t *testing.T) {
fixtures := []string{
"foo", "foobar", "bar", "foobuzz", "buzz", "f", " ", "fba", "foofoofoo", "b", "foob", "bfoo", "FoO",
"foo, 世界", allunicode(),
}
for _, test := range []struct {
re string
Expand Down Expand Up @@ -93,6 +94,14 @@ func Test_SimplifiedRegex(t *testing.T) {
}
}

func allunicode() string {
var b []byte
for i := 0x00; i <= 0x10FFFF; i++ {
b = append(b, byte(i))
}
return string(b)
}

func Test_TrueFilter(t *testing.T) {
empty := []byte("")
for _, test := range []struct {
Expand Down