Browse Source

MB-33617: Support unicode strings with combining characters

Context: reverse-token-filter
abhinavdangeti 2 months ago
parent
commit
7ca4ea2cd8
2 changed files with 45 additions and 15 deletions
  1. 22 13
      analysis/token/reverse/reverse.go
  2. 23 2
      analysis/token/reverse/reverse_test.go

+ 22 - 13
analysis/token/reverse/reverse.go

@@ -15,6 +15,7 @@
 package reverse
 
 import (
+	"unicode"
 	"unicode/utf8"
 
 	"github.com/blevesearch/bleve/analysis"
@@ -47,20 +48,28 @@ func init() {
 }
 
 // reverse returns a new slice holding the contents of the UTF-8 encoded
 // byte slice s in reverse order. The input is not modified.
 //
 // Reversal is done per cluster rather than per byte or per rune: a rune
 // together with any combining marks (Unicode categories Mn, Me, Mc) that
 // immediately follow it is copied as one unit, so an accent stays attached
 // to its base character in the output.
 //
 // Bytes that are not valid UTF-8 are treated as one-byte units and copied
 // through unchanged, so the result always has the same length as s.
 func reverse(s []byte) []byte {
 	output := make([]byte, len(s))
 	// cursorOut is the write position; it walks from the end of output
 	// toward the start while cursorIn walks s from start to end.
 	cursorOut := len(s)
 	for cursorIn := 0; cursorIn < len(s); {
 		// Width comes from decoding the actual bytes. utf8.DecodeRune
 		// reports a width of 1 for an invalid encoding, which keeps the
 		// cursors inside the slice bounds for malformed input.
 		_, wid := utf8.DecodeRune(s[cursorIn:])
 		next := cursorIn + wid
 		// Extend the cluster over any combining marks that follow.
 		for next < len(s) {
 			r, w := utf8.DecodeRune(s[next:])
 			if !unicode.IsMark(r) {
 				break
 			}
 			next += w
 		}
 		cluster := s[cursorIn:next]
 		cursorOut -= len(cluster)
 		copy(output[cursorOut:], cluster)
 		cursorIn = next
 	}
 	return output
 }

+ 23 - 2
analysis/token/reverse/reverse_test.go

@@ -23,6 +23,7 @@ import (
 
 func TestReverseFilter(t *testing.T) {
 	inputTokenStream := analysis.TokenStream{
+		&analysis.Token{},
 		&analysis.Token{
 			Term: []byte("one"),
 		},
@@ -47,10 +48,19 @@ func TestReverseFilter(t *testing.T) {
 		&analysis.Token{
 			Term: []byte("!@#$%^&*()"),
 		},
-		&analysis.Token{},
+		&analysis.Token{
+			Term: []byte("cafés"),
+		},
+		&analysis.Token{
+			Term: []byte("¿Dónde estás?"),
+		},
+		&analysis.Token{
+			Term: []byte("Me gustaría una cerveza."),
+		},
 	}
 
 	expectedTokenStream := analysis.TokenStream{
+		&analysis.Token{},
 		&analysis.Token{
 			Term: []byte("eno"),
 		},
@@ -75,7 +85,15 @@ func TestReverseFilter(t *testing.T) {
 		&analysis.Token{
 			Term: []byte(")(*&^%$#@!"),
 		},
-		&analysis.Token{},
+		&analysis.Token{
+			Term: []byte("séfac"),
+		},
+		&analysis.Token{
+			Term: []byte("?sátse ednóD¿"),
+		},
+		&analysis.Token{
+			Term: []byte(".azevrec anu aíratsug eM"),
+		},
 	}
 
 	filter := NewReverseFilter()
@@ -153,6 +171,9 @@ func BenchmarkReverseFilter(b *testing.B) {
 		&analysis.Token{
 			Term: []byte("İȺȾCAT"),
 		},
+		&analysis.Token{
+			Term: []byte("Me gustaría una cerveza."),
+		},
 	}
 	filter := NewReverseFilter()