Browse Source

fixes #1168 - filter duplicate locations in search results

This fix filters out duplicate locations using a destructive in-place
sort on the locations slice, and then a pass to remove duplicates.

This hopefully does not have undue impact on normal case performance,
as the deduplication effort happens only in the "last step" of
finalizeResults() and its DocumentMatch.Complete() invocations.

e.g., the deduplication will be only invoked on the final result hits
(e.g., "size": 10) instead of on all the (potentially many) candidate
document-matches which were examined beforehand.

Also, we only perform the deduplication effort only if we detect that
the locations that are being examined look like they're not in
increasing order.
Steve Yen 2 months ago
parent
commit
59adf0b315
4 changed files with 199 additions and 2 deletions
  1. 81 1
      search/search.go
  2. 76 0
      search/search_test.go
  3. 41 0
      search_test.go
  4. 1 1
      test/integration_test.go

+ 81 - 1
search/search.go

@@ -17,6 +17,7 @@ package search
 import (
 	"fmt"
 	"reflect"
+	"sort"
 
 	"github.com/blevesearch/bleve/index"
 	"github.com/blevesearch/bleve/size"
@@ -49,6 +50,24 @@ func (ap ArrayPositions) Equals(other ArrayPositions) bool {
 	return true
 }
 
+func (ap ArrayPositions) Compare(other ArrayPositions) int {
+	for i, p := range ap {
+		if i >= len(other) {
+			return 1
+		}
+		if p < other[i] {
+			return -1
+		}
+		if p > other[i] {
+			return 1
+		}
+	}
+	if len(ap) < len(other) {
+		return -1
+	}
+	return 0
+}
+
 type Location struct {
 	// Pos is the position of the term within the field, starting at 1
 	Pos uint64 `json:"pos"`
@@ -68,6 +87,46 @@ func (l *Location) Size() int {
 
 type Locations []*Location
 
+func (p Locations) Len() int      { return len(p) }
+func (p Locations) Swap(i, j int) { p[i], p[j] = p[j], p[i] }
+
+func (p Locations) Less(i, j int) bool {
+	c := p[i].ArrayPositions.Compare(p[j].ArrayPositions)
+	if c < 0 {
+		return true
+	}
+	if c > 0 {
+		return false
+	}
+	return p[i].Pos < p[j].Pos
+}
+
+func (p Locations) Dedupe() Locations { // destructive!
+	if len(p) <= 1 {
+		return p
+	}
+
+	sort.Sort(p)
+
+	slow := 0
+
+	for _, pfast := range p {
+		pslow := p[slow]
+		if pslow.Pos == pfast.Pos &&
+			pslow.Start == pfast.Start &&
+			pslow.End == pfast.End &&
+			pslow.ArrayPositions.Equals(pfast.ArrayPositions) {
+			continue // duplicate, so only move fast ahead
+		}
+
+		slow++
+
+		p[slow] = pfast
+	}
+
+	return p[:slow+1]
+}
+
 type TermLocationMap map[string]Locations
 
 func (t TermLocationMap) AddLocation(term string, location *Location) {
@@ -208,6 +267,7 @@ func (dm *DocumentMatch) Complete(prealloc []Location) []Location {
 
 		var lastField string
 		var tlm TermLocationMap
+		var needsDedupe bool
 
 		for i, ftl := range dm.FieldTermLocations {
 			if lastField != ftl.Field {
@@ -231,7 +291,19 @@ func (dm *DocumentMatch) Complete(prealloc []Location) []Location {
 				loc.ArrayPositions = append(ArrayPositions(nil), loc.ArrayPositions...)
 			}
 
-			tlm[ftl.Term] = append(tlm[ftl.Term], loc)
+			locs := tlm[ftl.Term]
+
+			// if the loc is before or at the last location, then there
+			// might be duplicates that need to be deduplicated
+			if !needsDedupe && len(locs) > 0 {
+				last := locs[len(locs)-1]
+				cmp := loc.ArrayPositions.Compare(last.ArrayPositions)
+				if cmp < 0 || (cmp == 0 && loc.Pos <= last.Pos) {
+					needsDedupe = true
+				}
+			}
+
+			tlm[ftl.Term] = append(locs, loc)
 
 			dm.FieldTermLocations[i] = FieldTermLocation{ // recycle
 				Location: Location{
@@ -239,6 +311,14 @@ func (dm *DocumentMatch) Complete(prealloc []Location) []Location {
 				},
 			}
 		}
+
+		if needsDedupe {
+			for _, tlm := range dm.Locations {
+				for term, locs := range tlm {
+					tlm[term] = locs.Dedupe()
+				}
+			}
+		}
 	}
 
 	dm.FieldTermLocations = dm.FieldTermLocations[:0] // recycle

+ 76 - 0
search/search_test.go

@@ -0,0 +1,76 @@
+//  Copyright (c) 2019 Couchbase, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// 		http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package search
+
+import (
+	"reflect"
+	"testing"
+)
+
+func TestArrayPositionsCompare(t *testing.T) {
+	tests := []struct {
+		a      []uint64
+		b      []uint64
+		expect int
+	}{
+		{nil, nil, 0},
+		{[]uint64{}, []uint64{}, 0},
+		{[]uint64{1}, []uint64{}, 1},
+		{[]uint64{1}, []uint64{1}, 0},
+		{[]uint64{}, []uint64{1}, -1},
+		{[]uint64{0}, []uint64{1}, -1},
+		{[]uint64{1}, []uint64{0}, 1},
+		{[]uint64{1}, []uint64{1, 2}, -1},
+		{[]uint64{1, 2}, []uint64{1}, 1},
+		{[]uint64{1, 2}, []uint64{1, 2}, 0},
+		{[]uint64{1, 2}, []uint64{1, 200}, -1},
+		{[]uint64{1, 2}, []uint64{100, 2}, -1},
+		{[]uint64{1, 2}, []uint64{1, 2, 3}, -1},
+	}
+
+	for _, test := range tests {
+		res := ArrayPositions(test.a).Compare(test.b)
+		if res != test.expect {
+			t.Errorf("test: %+v, res: %v", test, res)
+		}
+	}
+}
+
+func TestLocationsDedupe(t *testing.T) {
+	a := &Location{}
+	b := &Location{Pos: 1}
+	c := &Location{Pos: 2}
+
+	tests := []struct {
+		input  Locations
+		expect Locations
+	}{
+		{Locations{}, Locations{}},
+		{Locations{a}, Locations{a}},
+		{Locations{a, b, c}, Locations{a, b, c}},
+		{Locations{a, a}, Locations{a}},
+		{Locations{a, a, a}, Locations{a}},
+		{Locations{a, b}, Locations{a, b}},
+		{Locations{b, a}, Locations{a, b}},
+		{Locations{c, b, a, c, b, a, c, b, a}, Locations{a, b, c}},
+	}
+
+	for testi, test := range tests {
+		res := test.input.Dedupe()
+		if !reflect.DeepEqual(res, test.expect) {
+			t.Errorf("testi: %d, test: %+v, res: %+v", testi, test, res)
+		}
+	}
+}

+ 41 - 0
search_test.go

@@ -36,6 +36,7 @@ import (
 	"github.com/blevesearch/bleve/index/upsidedown"
 	"github.com/blevesearch/bleve/mapping"
 	"github.com/blevesearch/bleve/search"
+	"github.com/blevesearch/bleve/search/highlight/highlighter/html"
 	"github.com/blevesearch/bleve/search/query"
 )
 
@@ -1223,3 +1224,43 @@ func TestDisjunctionMinPropagation(t *testing.T) {
 		t.Fatalf("Expect 0 results, but got: %v", res.Total)
 	}
 }
+
+func TestDuplicateLocationsIssue1168(t *testing.T) {
+	fm1 := NewTextFieldMapping()
+	fm1.Analyzer = keyword.Name
+	fm1.Name = "name1"
+
+	dm := NewDocumentStaticMapping()
+	dm.AddFieldMappingsAt("name", fm1)
+
+	m := NewIndexMapping()
+	m.DefaultMapping = dm
+
+	idx, err := NewMemOnly(m)
+	if err != nil {
+		t.Fatalf("bleve new err: %v", err)
+	}
+
+	err = idx.Index("x", map[string]interface{}{
+		"name": "marty",
+	})
+	if err != nil {
+		t.Fatalf("bleve index err: %v", err)
+	}
+
+	q1 := NewTermQuery("marty")
+	q2 := NewTermQuery("marty")
+	dq := NewDisjunctionQuery(q1, q2)
+
+	sreq := NewSearchRequest(dq)
+	sreq.Fields = []string{"*"}
+	sreq.Highlight = NewHighlightWithStyle(html.Name)
+
+	sres, err := idx.Search(sreq)
+	if err != nil {
+		t.Fatalf("bleve search err: %v", err)
+	}
+	if len(sres.Hits[0].Locations["name1"]["marty"]) != 1 {
+		t.Fatalf("duplicate marty")
+	}
+}

+ 1 - 1
test/integration_test.go

@@ -187,7 +187,7 @@ func runTestDir(t *testing.T, dir, datasetName string) {
 				if hit.Locations != nil {
 					if !reflect.DeepEqual(hit.Locations, res.Hits[hi].Locations) {
 						t.Errorf("test error - %s", search.Comment)
-						t.Errorf("test %d - expected hit %d to have locations %v got %v", testNum, hi, hit.Locations, res.Hits[hi].Locations)
+						t.Errorf("test %d - expected hit %d to have locations %#v got %#v", testNum, hi, hit.Locations, res.Hits[hi].Locations)
 					}
 				}
 				// assert that none of the scores were NaN,+Inf,-Inf