Browse Source

MB-33617: Supporting the "reverse" token filter

This token filter will simply reverse each token.
abhinavdangeti 1 month ago
parent
commit
4e0f481955
3 changed files with 230 additions and 0 deletions
  1. 66 0
      analysis/token/reverse/reverse.go
  2. 163 0
      analysis/token/reverse/reverse_test.go
  3. 1 0
      config/config.go

+ 66 - 0
analysis/token/reverse/reverse.go

@@ -0,0 +1,66 @@
+//  Copyright (c) 2019 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// 		http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package reverse
+
+import (
+	"unicode"
+	"unicode/utf8"
+
+	"github.com/blevesearch/bleve/analysis"
+	"github.com/blevesearch/bleve/registry"
+)
+
+// Name is the identifier under which this package's init function
+// registers the reverse token filter in the bleve registry.
+const Name = "reverse"
+
+// ReverseFilter is a token filter that reverses the term of every token
+// handed to it. It carries no state and takes no configuration.
+type ReverseFilter struct{}
+
+// NewReverseFilter returns a pointer to a freshly allocated ReverseFilter.
+func NewReverseFilter() *ReverseFilter {
+	return new(ReverseFilter)
+}
+
+// Filter replaces the Term of each token in input with its reversed form
+// and returns the same stream. The tokens are updated in place; no new
+// tokens or stream are created.
+func (f *ReverseFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
+	for i := range input {
+		tok := input[i]
+		tok.Term = reverse(tok.Term)
+	}
+	return input
+}
+
+// ReverseFilterConstructor is the constructor that init registers with the
+// bleve registry under Name. Neither config nor cache is consulted; it
+// always hands back a new ReverseFilter together with a nil error.
+func ReverseFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	var filter analysis.TokenFilter = NewReverseFilter()
+	return filter, nil
+}
+
+// init registers ReverseFilterConstructor with the bleve registry under
+// Name when this package is imported.
+func init() {
+	registry.RegisterTokenFilter(
+		Name,
+		ReverseFilterConstructor,
+	)
+}
+
+// reverse returns a newly allocated slice holding the UTF-8 encoded
+// characters of s in reverse order; s itself is left untouched.
+//
+// Reversal works on a base character together with any combining marks
+// (Unicode category M) that directly follow it, so an accent stays
+// attached to the letter it modifies instead of migrating to the
+// neighbouring one. Bytes that do not form valid UTF-8 are carried over
+// one at a time, which keeps the output the same length as the input.
+func reverse(s []byte) []byte {
+	rv := make([]byte, len(s))
+	j := len(s) // rv[j:] has already been filled
+	for i := 0; i < len(s); {
+		// Width of the leading character; ASCII skips the decoder.
+		wid := 1
+		if s[i] >= utf8.RuneSelf {
+			_, wid = utf8.DecodeRune(s[i:])
+		}
+
+		// Extend the unit over trailing combining marks. Every mark is
+		// non-ASCII, so an ASCII byte ends the scan immediately.
+		for i+wid < len(s) && s[i+wid] >= utf8.RuneSelf {
+			r, n := utf8.DecodeRune(s[i+wid:])
+			if !unicode.IsMark(r) {
+				break
+			}
+			wid += n
+		}
+
+		copy(rv[j-wid:j], s[i:i+wid])
+		i += wid
+		j -= wid
+	}
+	return rv
+}

+ 163 - 0
analysis/token/reverse/reverse_test.go

@@ -0,0 +1,163 @@
+//  Copyright (c) 2019 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// 		http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package reverse
+
+import (
+	"bytes"
+	"testing"
+
+	"github.com/blevesearch/bleve/analysis"
+)
+
+// TestReverseFilter runs a stream of tokens through the filter and checks
+// each resulting term against its expected reversal. The table covers
+// plain ASCII, mixed case, embedded spaces and punctuation, multi-byte
+// UTF-8 characters, and a token whose Term is unset.
+func TestReverseFilter(t *testing.T) {
+	cases := []struct {
+		term     []byte
+		reversed []byte
+	}{
+		{term: []byte("one"), reversed: []byte("eno")},
+		{term: []byte("TWo"), reversed: []byte("oWT")},
+		{term: []byte("thRee"), reversed: []byte("eeRht")},
+		{term: []byte("four's"), reversed: []byte("s'ruof")},
+		{term: []byte("what's this in reverse"), reversed: []byte("esrever ni siht s'tahw")},
+		{term: []byte("œ∑´®†"), reversed: []byte("†®´∑œ")},
+		{term: []byte("İȺȾCAT÷≥≤µ123"), reversed: []byte("321µ≤≥÷TACȾȺİ")},
+		{term: []byte("!@#$%^&*()"), reversed: []byte(")(*&^%$#@!")},
+		{}, // token with no term at all
+	}
+
+	inputTokenStream := make(analysis.TokenStream, 0, len(cases))
+	for _, c := range cases {
+		inputTokenStream = append(inputTokenStream, &analysis.Token{Term: c.term})
+	}
+
+	outputTokenStream := NewReverseFilter().Filter(inputTokenStream)
+	for i, c := range cases {
+		got := outputTokenStream[i].Term
+		if bytes.Equal(got, c.reversed) {
+			continue
+		}
+		t.Errorf("[%d] expected %s got %s",
+			i+1, c.reversed, got)
+	}
+}
+
+// BenchmarkReverseFilter measures Filter over a fixed 21-token stream made
+// of short ASCII words plus one term containing multi-byte characters.
+// The stream is built before the timer is reset, so only the filtering
+// itself is timed. The same stream is passed on every iteration.
+func BenchmarkReverseFilter(b *testing.B) {
+	words := []string{
+		"A", "boiling", "liquid", "expanding", "vapor", "explosion",
+		"caused", "by", "the", "rupture", "of", "a", "vessel",
+		"containing", "pressurized", "liquid", "above", "its",
+		"boiling", "point", "İȺȾCAT",
+	}
+
+	input := make(analysis.TokenStream, len(words))
+	for i, w := range words {
+		input[i] = &analysis.Token{Term: []byte(w)}
+	}
+	filter := NewReverseFilter()
+
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		filter.Filter(input)
+	}
+}

+ 1 - 0
config/config.go

@@ -52,6 +52,7 @@ import (
 	_ "github.com/blevesearch/bleve/analysis/token/length"
 	_ "github.com/blevesearch/bleve/analysis/token/lowercase"
 	_ "github.com/blevesearch/bleve/analysis/token/ngram"
+	_ "github.com/blevesearch/bleve/analysis/token/reverse"
 	_ "github.com/blevesearch/bleve/analysis/token/shingle"
 	_ "github.com/blevesearch/bleve/analysis/token/stop"
 	_ "github.com/blevesearch/bleve/analysis/token/truncate"