Browse Source

MB-28847: Account for total documents' size within a batch

+ Supporting APIs to fetch these stats: last added
  document's size and total documents' size.
abhinavdangeti 10 months ago
parent
commit
bdd917bb12

+ 12 - 4
document/document.go

@@ -43,10 +43,18 @@ func NewDocument(id string) *Document {
 }
 
 func (d *Document) Size() int {
-	return reflectStaticSizeDocument + size.SizeOfPtr +
-		len(d.ID) +
-		len(d.Fields)*size.SizeOfPtr +
-		len(d.CompositeFields)*(size.SizeOfPtr+reflectStaticSizeCompositeField)
+	sizeInBytes := reflectStaticSizeDocument + size.SizeOfPtr +
+		len(d.ID)
+
+	for _, entry := range d.Fields {
+		sizeInBytes += entry.Size()
+	}
+
+	for _, entry := range d.CompositeFields {
+		sizeInBytes += entry.Size()
+	}
+
+	return sizeInBytes
 }
 
 func (d *Document) AddField(f Field) *Document {

+ 2 - 0
document/field.go

@@ -36,4 +36,6 @@ type Field interface {
 	// that this field represents - this is a common metric for tracking
 	// the rate of indexing
 	NumPlainTextBytes() uint64
+
+	Size() int
 }

+ 16 - 0
document/field_boolean.go

@@ -16,10 +16,19 @@ package document
 
 import (
 	"fmt"
+	"reflect"
 
 	"github.com/blevesearch/bleve/analysis"
+	"github.com/blevesearch/bleve/size"
 )
 
+// reflectStaticSizeBooleanField caches the size in bytes of a BooleanField
+// value as reported by reflect; it is filled in by init and read by Size.
+var reflectStaticSizeBooleanField int
+
+// init computes the BooleanField struct size once at package load.
+func init() {
+	reflectStaticSizeBooleanField = int(reflect.TypeOf(BooleanField{}).Size())
+}
+
 const DefaultBooleanIndexingOptions = StoreField | IndexField | DocValues
 
 type BooleanField struct {
@@ -30,6 +39,13 @@ type BooleanField struct {
 	numPlainTextBytes uint64
 }
 
+// Size returns the number of bytes accounted to this field: the static
+// struct size plus one pointer, the bytes of the name, size.SizeOfUint64 per
+// array position, and the bytes of the value.
+func (b *BooleanField) Size() int {
+	fixed := reflectStaticSizeBooleanField + size.SizeOfPtr
+	positions := len(b.arrayPositions) * size.SizeOfUint64
+
+	return fixed + len(b.name) + positions + len(b.value)
+}
+
 // Name returns the name stored in this field.
 func (b *BooleanField) Name() string {
 	return b.name
 }

+ 16 - 0
document/field_composite.go

@@ -18,6 +18,7 @@ import (
 	"reflect"
 
 	"github.com/blevesearch/bleve/analysis"
+	"github.com/blevesearch/bleve/size"
 )
 
 var reflectStaticSizeCompositeField int
@@ -63,6 +64,21 @@ func NewCompositeFieldWithIndexingOptions(name string, defaultInclude bool, incl
 	return rv
 }
 
+// Size returns the number of bytes accounted to this composite field: the
+// static struct size plus one pointer, the bytes of the name, and a
+// per-entry charge for both the included and excluded field maps.
+func (c *CompositeField) Size() int {
+	sizeInBytes := reflectStaticSizeCompositeField + size.SizeOfPtr +
+		len(c.name)
+
+	// Every map entry is charged size.SizeOfString, the bytes of its key
+	// and size.SizeOfBool.
+	for k := range c.includedFields {
+		sizeInBytes += size.SizeOfString + len(k) + size.SizeOfBool
+	}
+
+	for k := range c.excludedFields {
+		sizeInBytes += size.SizeOfString + len(k) + size.SizeOfBool
+	}
+
+	return sizeInBytes
+}
+
 // Name returns the name stored in this composite field.
 func (c *CompositeField) Name() string {
 	return c.name
 }

+ 15 - 0
document/field_datetime.go

@@ -17,12 +17,21 @@ package document
 import (
 	"fmt"
 	"math"
+	"reflect"
 	"time"
 
 	"github.com/blevesearch/bleve/analysis"
 	"github.com/blevesearch/bleve/numeric"
+	"github.com/blevesearch/bleve/size"
 )
 
+// reflectStaticSizeDateTimeField caches the size in bytes of a DateTimeField
+// value as reported by reflect; it is filled in by init and read by Size.
+var reflectStaticSizeDateTimeField int
+
+// init computes the DateTimeField struct size once at package load.
+func init() {
+	reflectStaticSizeDateTimeField = int(reflect.TypeOf(DateTimeField{}).Size())
+}
+
 const DefaultDateTimeIndexingOptions = StoreField | IndexField | DocValues
 const DefaultDateTimePrecisionStep uint = 4
 
@@ -37,6 +46,12 @@ type DateTimeField struct {
 	numPlainTextBytes uint64
 }
 
+// Size returns the number of bytes accounted to this field: the static
+// struct size plus one pointer, the bytes of the name, and
+// size.SizeOfUint64 per array position.
+//
+// NOTE(review): unlike BooleanField.Size and TextField.Size, no value bytes
+// are added here - confirm whether the encoded value should be counted.
+func (n *DateTimeField) Size() int {
+	fixed := reflectStaticSizeDateTimeField + size.SizeOfPtr
+	positions := len(n.arrayPositions) * size.SizeOfUint64
+
+	return fixed + len(n.name) + positions
+}
+
 // Name returns the name stored in this field.
 func (n *DateTimeField) Name() string {
 	return n.name
 }

+ 15 - 0
document/field_geopoint.go

@@ -16,12 +16,21 @@ package document
 
 import (
 	"fmt"
+	"reflect"
 
 	"github.com/blevesearch/bleve/analysis"
 	"github.com/blevesearch/bleve/geo"
 	"github.com/blevesearch/bleve/numeric"
+	"github.com/blevesearch/bleve/size"
 )
 
+// reflectStaticSizeGeoPointField caches the size in bytes of a GeoPointField
+// value as reported by reflect; it is filled in by init and read by Size.
+var reflectStaticSizeGeoPointField int
+
+// init computes the GeoPointField struct size once at package load.
+func init() {
+	reflectStaticSizeGeoPointField = int(reflect.TypeOf(GeoPointField{}).Size())
+}
+
 var GeoPrecisionStep uint = 9
 
 type GeoPointField struct {
@@ -32,6 +41,12 @@ type GeoPointField struct {
 	numPlainTextBytes uint64
 }
 
+// Size returns the number of bytes accounted to this field: the static
+// struct size plus one pointer, the bytes of the name, and
+// size.SizeOfUint64 per array position.
+//
+// NOTE(review): unlike BooleanField.Size and TextField.Size, no value bytes
+// are added here - confirm whether the encoded value should be counted.
+func (n *GeoPointField) Size() int {
+	fixed := reflectStaticSizeGeoPointField + size.SizeOfPtr
+	positions := len(n.arrayPositions) * size.SizeOfUint64
+
+	return fixed + len(n.name) + positions
+}
+
 // Name returns the name stored in this field.
 func (n *GeoPointField) Name() string {
 	return n.name
 }

+ 15 - 0
document/field_numeric.go

@@ -16,11 +16,20 @@ package document
 
 import (
 	"fmt"
+	"reflect"
 
 	"github.com/blevesearch/bleve/analysis"
 	"github.com/blevesearch/bleve/numeric"
+	"github.com/blevesearch/bleve/size"
 )
 
+// reflectStaticSizeNumericField caches the size in bytes of a NumericField
+// value as reported by reflect; it is filled in by init and read by Size.
+var reflectStaticSizeNumericField int
+
+// init computes the NumericField struct size once at package load.
+func init() {
+	reflectStaticSizeNumericField = int(reflect.TypeOf(NumericField{}).Size())
+}
+
 const DefaultNumericIndexingOptions = StoreField | IndexField | DocValues
 
 const DefaultPrecisionStep uint = 4
@@ -33,6 +42,12 @@ type NumericField struct {
 	numPlainTextBytes uint64
 }
 
+// Size returns the number of bytes accounted to this field: the static
+// struct size plus one pointer, the bytes of the name, and
+// size.SizeOfUint64 per array position.
+//
+// Array positions are charged size.SizeOfUint64 each, as in the Size methods
+// of the other field types; charging size.SizeOfPtr would under-count
+// wherever a pointer is narrower than a uint64.
+// NOTE(review): assumes arrayPositions holds uint64 values like the sibling
+// field types - confirm against the struct definition.
+func (n *NumericField) Size() int {
+	return reflectStaticSizeNumericField + size.SizeOfPtr +
+		len(n.name) +
+		len(n.arrayPositions)*size.SizeOfUint64
+}
+
 // Name returns the name stored in this field.
 func (n *NumericField) Name() string {
 	return n.name
 }

+ 16 - 0
document/field_text.go

@@ -16,10 +16,19 @@ package document
 
 import (
 	"fmt"
+	"reflect"
 
 	"github.com/blevesearch/bleve/analysis"
+	"github.com/blevesearch/bleve/size"
 )
 
+// reflectStaticSizeTextField caches the size in bytes of a TextField value
+// as reported by reflect; it is filled in by init and read by Size.
+var reflectStaticSizeTextField int
+
+// init computes the TextField struct size once at package load.
+func init() {
+	reflectStaticSizeTextField = int(reflect.TypeOf(TextField{}).Size())
+}
+
 const DefaultTextIndexingOptions = IndexField | DocValues
 
 type TextField struct {
@@ -31,6 +40,13 @@ type TextField struct {
 	numPlainTextBytes uint64
 }
 
+// Size returns the number of bytes accounted to this field: the static
+// struct size plus one pointer, the bytes of the name, size.SizeOfUint64 per
+// array position, and the bytes of the value.
+func (t *TextField) Size() int {
+	fixed := reflectStaticSizeTextField + size.SizeOfPtr
+	positions := len(t.arrayPositions) * size.SizeOfUint64
+
+	return fixed + len(t.name) + positions + len(t.value)
+}
+
 // Name returns the name stored in this field.
 func (t *TextField) Name() string {
 	return t.name
 }

+ 17 - 0
index.go

@@ -21,6 +21,7 @@ import (
 	"github.com/blevesearch/bleve/index"
 	"github.com/blevesearch/bleve/index/store"
 	"github.com/blevesearch/bleve/mapping"
+	"github.com/blevesearch/bleve/size"
 )
 
 // A Batch groups together multiple Index and Delete
@@ -32,6 +33,9 @@ import (
 type Batch struct {
 	index    Index
 	internal *index.Batch
+
+	lastDocSize uint64 // size recorded by Index for the most recently added document
+	totalSize   uint64 // running sum of lastDocSize over Index calls
 }
 
 // Index adds the specified index operation to the
@@ -47,9 +51,22 @@ func (b *Batch) Index(id string, data interface{}) error {
 		return err
 	}
 	b.internal.Update(doc)
+
+	b.lastDocSize = uint64(doc.Size() +
+		len(id) + size.SizeOfString) // overhead from internal
+	b.totalSize += b.lastDocSize
+
 	return nil
 }
 
+// LastDocSize returns the size, in bytes, recorded for the document most
+// recently added to the batch through Index.
+//
+// NOTE(review): Index is the only writer of this value visible here; confirm
+// whether IndexAdvanced and any reset path should maintain it as well.
+func (b *Batch) LastDocSize() uint64 {
+	return b.lastDocSize
+}
+
+// TotalDocsSize returns the accumulated size, in bytes, of the documents
+// added to the batch through Index.
+//
+// NOTE(review): Index is the only writer of this value visible here; confirm
+// whether IndexAdvanced and any reset path should maintain it as well.
+func (b *Batch) TotalDocsSize() uint64 {
+	return b.totalSize
+}
+
 // IndexAdvanced adds the specified index operation to the
 // batch which skips the mapping.  NOTE: the bleve Index is not updated
 // until the batch is executed.