Browse Source

optimize TermFacets (#1404)

* optimize TermFacets

The existing TermFacet implementation was O(n) when adding
new TermFacets. All TermFacets were stored in a slice and
the code iterated over the entire slice to determine if
a term already existed.

In MultiSearch, TermFacets for searches across each Index
were merged together, resulting in several iterations over
the same slice.

This change adds a map to `TermFacets` so that existence
checks are done in O(1).

* add go.sum to .gitignore
Tyler Kovacs 10 months ago
parent
commit
17d02632cc
6 changed files with 145 additions and 81 deletions
  1. 1 0
      .gitignore
  2. 2 2
      examples_test.go
  3. 1 1
      search.go
  4. 6 6
      search/facet/facet_builder_terms.go
  5. 69 22
      search/facets_builder.go
  6. 66 50
      search/facets_builder_test.go

+ 1 - 0
.gitignore

@@ -17,3 +17,4 @@ vendor/**
 /search/query/y.output
 *.test
 tags
+go.sum

+ 2 - 2
examples_test.go

@@ -269,7 +269,7 @@ func ExampleNewFacetRequest() {
 	// numer of docs with no value for this field
 	fmt.Println(searchResults.Facets["facet name"].Missing)
 	// term with highest occurrences in field name
-	fmt.Println(searchResults.Facets["facet name"].Terms[0].Term)
+	fmt.Println(searchResults.Facets["facet name"].Terms.Terms()[0].Term)
 	// Output:
 	// 5
 	// 2
@@ -355,7 +355,7 @@ func ExampleSearchRequest_AddFacet() {
 	// numer of docs with no value for this field
 	fmt.Println(searchResults.Facets["facet name"].Missing)
 	// term with highest occurrences in field name
-	fmt.Println(searchResults.Facets["facet name"].Terms[0].Term)
+	fmt.Println(searchResults.Facets["facet name"].Terms.Terms()[0].Term)
 	// Output:
 	// 5
 	// 2

+ 1 - 1
search.go

@@ -543,7 +543,7 @@ func (sr *SearchResult) String() string {
 		rv += fmt.Sprintf("Facets:\n")
 		for fn, f := range sr.Facets {
 			rv += fmt.Sprintf("%s(%d)\n", fn, f.Total)
-			for _, t := range f.Terms {
+			for _, t := range f.Terms.Terms() {
 				rv += fmt.Sprintf("\t%s(%d)\n", t.Term, t.Count)
 			}
 			if f.Other != 0 {

+ 6 - 6
search/facet/facet_builder_terms.go

@@ -87,7 +87,7 @@ func (fb *TermsFacetBuilder) Result() *search.FacetResult {
 		Missing: fb.missing,
 	}
 
-	rv.Terms = make([]*search.TermFacet, 0, len(fb.termsCount))
+	rv.Terms = &search.TermFacets{}
 
 	for term, count := range fb.termsCount {
 		tf := &search.TermFacet{
@@ -95,20 +95,20 @@ func (fb *TermsFacetBuilder) Result() *search.FacetResult {
 			Count: count,
 		}
 
-		rv.Terms = append(rv.Terms, tf)
+		rv.Terms.Add(tf)
 	}
 
 	sort.Sort(rv.Terms)
 
 	// we now have the list of the top N facets
 	trimTopN := fb.size
-	if trimTopN > len(rv.Terms) {
-		trimTopN = len(rv.Terms)
+	if trimTopN > rv.Terms.Len() {
+		trimTopN = rv.Terms.Len()
 	}
-	rv.Terms = rv.Terms[:trimTopN]
+	rv.Terms.TrimToTopN(trimTopN)
 
 	notOther := 0
-	for _, tf := range rv.Terms {
+	for _, tf := range rv.Terms.Terms() {
 		notOther += tf.Count
 	}
 	rv.Other = fb.total - notOther

+ 69 - 22
search/facets_builder.go

@@ -15,6 +15,7 @@
 package search
 
 import (
+	"encoding/json"
 	"reflect"
 	"sort"
 
@@ -112,27 +113,73 @@ type TermFacet struct {
 	Count int    `json:"count"`
 }
 
-type TermFacets []*TermFacet
+type TermFacets struct {
+	termFacets []*TermFacet
+	termLookup map[string]*TermFacet
+}
+
+func (tf *TermFacets) Terms() []*TermFacet {
+	return tf.termFacets
+}
 
-func (tf TermFacets) Add(termFacet *TermFacet) TermFacets {
-	for _, existingTerm := range tf {
-		if termFacet.Term == existingTerm.Term {
-			existingTerm.Count += termFacet.Count
-			return tf
+func (tf *TermFacets) TrimToTopN(n int) {
+	tf.termFacets = tf.termFacets[:n]
+}
+
+func (tf *TermFacets) Add(termFacets ...*TermFacet) {
+	for _, termFacet := range termFacets {
+		if tf.termLookup == nil {
+			tf.termLookup = map[string]*TermFacet{}
 		}
+
+		if term, ok := tf.termLookup[termFacet.Term]; ok {
+			term.Count += termFacet.Count
+			return
+		}
+
+		// if we got here it wasn't already in the existing terms
+		tf.termFacets = append(tf.termFacets, termFacet)
+		tf.termLookup[termFacet.Term] = termFacet
 	}
-	// if we got here it wasn't already in the existing terms
-	tf = append(tf, termFacet)
-	return tf
 }
 
-func (tf TermFacets) Len() int      { return len(tf) }
-func (tf TermFacets) Swap(i, j int) { tf[i], tf[j] = tf[j], tf[i] }
-func (tf TermFacets) Less(i, j int) bool {
-	if tf[i].Count == tf[j].Count {
-		return tf[i].Term < tf[j].Term
+func (tf *TermFacets) Len() int {
+	// Handle case where *TermFacets is not fully initialized in index_impl.go.init()
+	if tf == nil {
+		return 0
 	}
-	return tf[i].Count > tf[j].Count
+
+	return len(tf.termFacets)
+}
+func (tf *TermFacets) Swap(i, j int) {
+	tf.termFacets[i], tf.termFacets[j] = tf.termFacets[j], tf.termFacets[i]
+}
+func (tf *TermFacets) Less(i, j int) bool {
+	if tf.termFacets[i].Count == tf.termFacets[j].Count {
+		return tf.termFacets[i].Term < tf.termFacets[j].Term
+	}
+	return tf.termFacets[i].Count > tf.termFacets[j].Count
+}
+
+// TermFacets used to be a type alias for []*TermFacet.
+// To maintain backwards compatibility, we have to implement custom
+// JSON marshalling.
+func (tf *TermFacets) MarshalJSON() ([]byte, error) {
+	return json.Marshal(tf.termFacets)
+}
+
+func (tf *TermFacets) UnmarshalJSON(b []byte) error {
+	termFacets := []*TermFacet{}
+	err := json.Unmarshal(b, &termFacets)
+	if err != nil {
+		return err
+	}
+
+	for _, termFacet := range termFacets {
+		tf.Add(termFacet)
+	}
+
+	return nil
 }
 
 type NumericRangeFacet struct {
@@ -246,7 +293,7 @@ type FacetResult struct {
 	Total         int                `json:"total"`
 	Missing       int                `json:"missing"`
 	Other         int                `json:"other"`
-	Terms         TermFacets         `json:"terms,omitempty"`
+	Terms         *TermFacets        `json:"terms,omitempty"`
 	NumericRanges NumericRangeFacets `json:"numeric_ranges,omitempty"`
 	DateRanges    DateRangeFacets    `json:"date_ranges,omitempty"`
 }
@@ -254,7 +301,7 @@ type FacetResult struct {
 func (fr *FacetResult) Size() int {
 	return reflectStaticSizeFacetResult + size.SizeOfPtr +
 		len(fr.Field) +
-		len(fr.Terms)*(reflectStaticSizeTermFacet+size.SizeOfPtr) +
+		fr.Terms.Len()*(reflectStaticSizeTermFacet+size.SizeOfPtr) +
 		len(fr.NumericRanges)*(reflectStaticSizeNumericRangeFacet+size.SizeOfPtr) +
 		len(fr.DateRanges)*(reflectStaticSizeDateRangeFacet+size.SizeOfPtr)
 }
@@ -264,8 +311,8 @@ func (fr *FacetResult) Merge(other *FacetResult) {
 	fr.Missing += other.Missing
 	fr.Other += other.Other
 	if fr.Terms != nil && other.Terms != nil {
-		for _, term := range other.Terms {
-			fr.Terms = fr.Terms.Add(term)
+		for _, term := range other.Terms.termFacets {
+			fr.Terms.Add(term)
 		}
 	}
 	if fr.NumericRanges != nil && other.NumericRanges != nil {
@@ -283,12 +330,12 @@ func (fr *FacetResult) Merge(other *FacetResult) {
 func (fr *FacetResult) Fixup(size int) {
 	if fr.Terms != nil {
 		sort.Sort(fr.Terms)
-		if len(fr.Terms) > size {
-			moveToOther := fr.Terms[size:]
+		if fr.Terms.Len() > size {
+			moveToOther := fr.Terms.termFacets[size:]
 			for _, mto := range moveToOther {
 				fr.Other += mto.Count
 			}
-			fr.Terms = fr.Terms[0:size]
+			fr.Terms.termFacets = fr.Terms.termFacets[0:size]
 		}
 	} else if fr.NumericRanges != nil {
 		sort.Sort(fr.NumericRanges)

+ 66 - 50
search/facets_builder_test.go

@@ -21,66 +21,75 @@ import (
 
 func TestTermFacetResultsMerge(t *testing.T) {
 
+	fr1TypeTerms := &TermFacets{}
+	fr1TypeTerms.Add(
+		&TermFacet{
+			Term:  "blog",
+			Count: 25,
+		},
+		&TermFacet{
+			Term:  "comment",
+			Count: 24,
+		},
+		&TermFacet{
+			Term:  "feedback",
+			Count: 1,
+		},
+	)
 	fr1 := &FacetResult{
 		Field:   "type",
 		Total:   100,
 		Missing: 25,
 		Other:   25,
-		Terms: []*TermFacet{
-			{
-				Term:  "blog",
-				Count: 25,
-			},
-			{
-				Term:  "comment",
-				Count: 24,
-			},
-			{
-				Term:  "feedback",
-				Count: 1,
-			},
-		},
+		Terms:   fr1TypeTerms,
 	}
+
+	fr1CategoryTerms := &TermFacets{}
+	fr1CategoryTerms.Add(
+		&TermFacet{
+			Term:  "clothing",
+			Count: 35,
+		},
+		&TermFacet{
+			Term:  "electronics",
+			Count: 25,
+		},
+	)
+
 	fr1Only := &FacetResult{
 		Field:   "category",
 		Total:   97,
 		Missing: 22,
 		Other:   15,
-		Terms: []*TermFacet{
-			{
-				Term:  "clothing",
-				Count: 35,
-			},
-			{
-				Term:  "electronics",
-				Count: 25,
-			},
-		},
+		Terms:   fr1CategoryTerms,
 	}
 	frs1 := FacetResults{
 		"types":      fr1,
 		"categories": fr1Only,
 	}
 
+	fr2TypeTerms := &TermFacets{}
+	fr2TypeTerms.Add(
+		&TermFacet{
+			Term:  "blog",
+			Count: 25,
+		},
+		&TermFacet{
+			Term:  "comment",
+			Count: 22,
+		},
+		&TermFacet{
+			Term:  "flag",
+			Count: 3,
+		},
+	)
+
 	fr2 := &FacetResult{
 		Field:   "type",
 		Total:   100,
 		Missing: 25,
 		Other:   25,
-		Terms: []*TermFacet{
-			{
-				Term:  "blog",
-				Count: 25,
-			},
-			{
-				Term:  "comment",
-				Count: 22,
-			},
-			{
-				Term:  "flag",
-				Count: 3,
-			},
-		},
+		Terms:   fr2TypeTerms,
 	}
 	frs2 := FacetResults{
 		"types": fr2,
@@ -91,18 +100,20 @@ func TestTermFacetResultsMerge(t *testing.T) {
 		Total:   200,
 		Missing: 50,
 		Other:   51,
-		Terms: []*TermFacet{
-			{
-				Term:  "blog",
-				Count: 50,
-			},
-			{
-				Term:  "comment",
-				Count: 46,
-			},
-			{
-				Term:  "flag",
-				Count: 3,
+		Terms: &TermFacets{
+			termFacets: []*TermFacet{
+				{
+					Term:  "blog",
+					Count: 50,
+				},
+				{
+					Term:  "comment",
+					Count: 46,
+				},
+				{
+					Term:  "flag",
+					Count: 3,
+				},
 			},
 		},
 	}
@@ -113,6 +124,11 @@ func TestTermFacetResultsMerge(t *testing.T) {
 
 	frs1.Merge(frs2)
 	frs1.Fixup("types", 3)
+
+	for _, v := range frs1 {
+		v.Terms.termLookup = nil
+	}
+
 	if !reflect.DeepEqual(frs1, expectedFrs) {
 		t.Errorf("expected %v, got %v", expectedFrs, frs1)
 	}