Browse Source

major refactor of analysis files, now wired up to registry

ultimately this is to make it more convenient for us to wire up
different elements of the analysis pipeline, without having to
preload everything into memory before we need it

separately the index layer now has a mechanism for storing
internal key/value pairs.  this is expected to be used to
store the mapping, and possibly other pieces of data by the
top layer, but not exposed to the user at the top.
Marty Schoch 4 years ago
parent
commit
c526a38369
100 changed files with 2635 additions and 89 deletions
  1. 44 0
      analysis/analyzers/detect_lang_analyzer/detect_lang_analyzer.go
  2. 32 0
      analysis/analyzers/keyword_analyzer/keyword_analyzer.go
  3. 40 0
      analysis/analyzers/simple_analyzer/simple_analyzer.go
  4. 46 0
      analysis/analyzers/standard_analyzer/standard_analyzer.go
  5. 30 0
      analysis/char_filters/html_char_filter/html_char_filter.go
  6. 27 0
      analysis/char_filters/regexp_char_filter/regexp_char_filter.go
  7. 30 0
      analysis/char_filters/zero_width_non_joiner/zero_width_non_joiner_char_filter.go
  8. 39 0
      analysis/datetime_parsers/datetime_optional/datetime_optional.go
  9. 23 0
      analysis/datetime_parsers/flexible_go/flexible_go.go
  10. 0 1
      analysis/freq_test.go
  11. 12 1
      analysis/token_filters/arabic_normalize/arabic_normalize.go
  12. 1 1
      analysis/token_filters/arabic_normalize/arabic_normalize_test.go
  13. 10 20
      analysis/token_filters/stop_words_filter/stop_words_filter.go
  14. 18 1
      analysis/token_filters/stop_words_filter/stop_words_ar.go
  15. 27 0
      analysis/language/bg/stop_filter_bg.go
  16. 18 1
      analysis/token_filters/stop_words_filter/stop_words_bg.go
  17. 30 0
      analysis/language/ca/articles_ca.go
  18. 31 0
      analysis/language/ca/elision_ca.go
  19. 55 0
      analysis/language/ca/elision_ca_test.go
  20. 27 0
      analysis/language/ca/stop_filter_ca.go
  21. 18 1
      analysis/token_filters/stop_words_filter/stop_words_ca.go
  22. 55 0
      analysis/language/ckb/analyzer_ckb.go
  23. 12 1
      analysis/token_filters/sorani_normalize/sorani_normalize.go
  24. 1 1
      analysis/token_filters/sorani_normalize/sorani_normalize_test.go
  25. 12 1
      analysis/token_filters/sorani_stemmer_filter/sorani_stemmer_filter.go
  26. 2 3
      analysis/token_filters/sorani_stemmer_filter/sorani_stemmer_filter_test.go
  27. 27 0
      analysis/language/ckb/stop_filter_ckb.go
  28. 18 1
      analysis/token_filters/stop_words_filter/stop_words_ckb.go
  29. 27 0
      analysis/language/cs/stop_filter_cs.go
  30. 18 1
      analysis/token_filters/stop_words_filter/stop_words_cs.go
  31. 50 0
      analysis/language/da/analyzer_da.go
  32. 9 7
      analysis/tokenizers/rune_tokenizer/whitespace_classifier.go
  33. 27 0
      analysis/language/da/stop_filter_da.go
  34. 18 1
      analysis/token_filters/stop_words_filter/stop_words_da.go
  35. 55 0
      analysis/language/de/analyzer_de.go
  36. 12 1
      analysis/token_filters/german_normalize/german_normalize.go
  37. 1 1
      analysis/token_filters/german_normalize/german_normalize_test.go
  38. 25 0
      analysis/language/de/stemmer_de.go
  39. 27 0
      analysis/language/de/stop_filter_de.go
  40. 18 1
      analysis/token_filters/stop_words_filter/stop_words_de.go
  41. 27 0
      analysis/language/el/stop_filter_el.go
  42. 18 1
      analysis/token_filters/stop_words_filter/stop_words_el.go
  43. 51 0
      analysis/language/en/analyzer_en.go
  44. 25 0
      analysis/language/en/stemmer_en.go
  45. 69 0
      analysis/language/en/stemmer_en_test.go
  46. 27 0
      analysis/language/en/stop_filter_en.go
  47. 18 1
      analysis/token_filters/stop_words_filter/stop_words_en.go
  48. 51 0
      analysis/language/es/analyzer_es.go
  49. 25 0
      analysis/language/es/stemmer_es.go
  50. 27 0
      analysis/language/es/stop_filter_es.go
  51. 18 1
      analysis/token_filters/stop_words_filter/stop_words_es.go
  52. 27 0
      analysis/language/eu/stop_filter_eu.go
  53. 18 1
      analysis/token_filters/stop_words_filter/stop_words_eu.go
  54. 65 0
      analysis/language/fa/analyzer_fa.go
  55. 12 1
      analysis/token_filters/persian_normalize/persian_normalize.go
  56. 1 1
      analysis/token_filters/persian_normalize/persian_normalize_test.go
  57. 27 0
      analysis/language/fa/stop_filter_fa.go
  58. 18 1
      analysis/token_filters/stop_words_filter/stop_words_fa.go
  59. 51 0
      analysis/language/fi/analyzer_fi.go
  60. 25 0
      analysis/language/fi/stemmer_fi.go
  61. 27 0
      analysis/language/fi/stop_filter_fi.go
  62. 18 1
      analysis/token_filters/stop_words_filter/stop_words_fi.go
  63. 56 0
      analysis/language/fr/analyzer_fr.go
  64. 37 0
      analysis/language/fr/articles_fr.go
  65. 31 0
      analysis/language/fr/elision_fr.go
  66. 18 24
      analysis/tokenizers/rune_tokenizer/rune_tokenizer_test.go
  67. 25 0
      analysis/language/fr/stemmer_fr.go
  68. 27 0
      analysis/language/fr/stop_filter_fr.go
  69. 18 1
      analysis/token_filters/stop_words_filter/stop_words_fr.go
  70. 27 0
      analysis/language/ga/articles_ga.go
  71. 31 0
      analysis/language/ga/elision_ga.go
  72. 49 0
      analysis/language/ga/elision_ga_test.go
  73. 27 0
      analysis/language/ga/stop_filter_ga.go
  74. 18 1
      analysis/token_filters/stop_words_filter/stop_words_ga.go
  75. 27 0
      analysis/language/gl/stop_filter_gl.go
  76. 18 1
      analysis/token_filters/stop_words_filter/stop_words_gl.go
  77. 12 1
      analysis/token_filters/hindi_normalize/hindi_normalize.go
  78. 1 1
      analysis/token_filters/hindi_normalize/hindi_normalize_test.go
  79. 12 1
      analysis/token_filters/hindi_stemmer_filter/hindi_stemmer_filter.go
  80. 1 1
      analysis/token_filters/hindi_stemmer_filter/hindi_stemmer_filter_test.go
  81. 27 0
      analysis/language/hi/stop_filter_hi.go
  82. 18 1
      analysis/token_filters/stop_words_filter/stop_words_hi.go
  83. 51 0
      analysis/language/hu/analyzer_hu.go
  84. 25 0
      analysis/language/hu/stemmer_hu.go
  85. 27 0
      analysis/language/hu/stop_filter_hu.go
  86. 18 1
      analysis/token_filters/stop_words_filter/stop_words_hu.go
  87. 27 0
      analysis/language/hy/stop_filter_hy.go
  88. 18 1
      analysis/token_filters/stop_words_filter/stop_words_hy.go
  89. 27 0
      analysis/language/id/stop_filter_id.go
  90. 18 1
      analysis/token_filters/stop_words_filter/stop_words_id.go
  91. 56 0
      analysis/language/it/analyzer_it.go
  92. 45 0
      analysis/language/it/articles_it.go
  93. 31 0
      analysis/language/it/elision_it.go
  94. 49 0
      analysis/language/it/elision_it_test.go
  95. 25 0
      analysis/language/it/stemmer_it.go
  96. 27 0
      analysis/language/it/stop_filter_it.go
  97. 18 1
      analysis/token_filters/stop_words_filter/stop_words_it.go
  98. 51 0
      analysis/language/nl/analyzer_nl.go
  99. 25 0
      analysis/language/nl/stemmer_nl.go
  100. 0 0
      analysis/language/nl/stop_filter_nl.go

+ 44 - 0
analysis/analyzers/detect_lang_analyzer/detect_lang_analyzer.go

@@ -0,0 +1,44 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+package detect_lang_analyzer
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+const Name = "detect_lang"
+
+// AnalyzerConstructor assembles the detect_lang analyzer from components
+// resolved through cache: the tokenizer named "single", then the token
+// filters named lower_case_filter.Name and "detect_lang", listed in that
+// order. The first lookup error is returned as-is. config is not consulted.
+func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
+	tokenizer, err := cache.TokenizerNamed("single")
+	if err != nil {
+		return nil, err
+	}
+	lowerCase, err := cache.TokenFilterNamed(lower_case_filter.Name)
+	if err != nil {
+		return nil, err
+	}
+	detectLang, err := cache.TokenFilterNamed("detect_lang")
+	if err != nil {
+		return nil, err
+	}
+	return &analysis.Analyzer{
+		Tokenizer:    tokenizer,
+		TokenFilters: []analysis.TokenFilter{lowerCase, detectLang},
+	}, nil
+}
+
+// init registers AnalyzerConstructor as an analyzer under Name
+// ("detect_lang") when the package is loaded.
+func init() {
+	registry.RegisterAnalyzer(Name, AnalyzerConstructor)
+}

+ 32 - 0
analysis/analyzers/keyword_analyzer/keyword_analyzer.go

@@ -0,0 +1,32 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+package keyword_analyzer
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/analysis/tokenizers/single_token"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+const Name = "keyword"
+
+// AnalyzerConstructor builds the keyword analyzer: just the tokenizer that
+// cache resolves for single_token.Name, with no token filters. A failed
+// lookup is returned unchanged. config is not consulted.
+func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
+	tokenizer, err := cache.TokenizerNamed(single_token.Name)
+	if err != nil {
+		return nil, err
+	}
+	return &analysis.Analyzer{Tokenizer: tokenizer}, nil
+}
+
+// init registers AnalyzerConstructor as an analyzer under Name ("keyword")
+// when the package is loaded.
+func init() {
+	registry.RegisterAnalyzer(Name, AnalyzerConstructor)
+}

+ 40 - 0
analysis/analyzers/simple_analyzer/simple_analyzer.go

@@ -0,0 +1,40 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+package simple_analyzer
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
+	"github.com/couchbaselabs/bleve/analysis/tokenizers/whitespace_tokenizer"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+const Name = "simple"
+
+// AnalyzerConstructor builds the simple analyzer: the tokenizer registered
+// as whitespace_tokenizer.Name followed by the single token filter
+// registered as lower_case_filter.Name. Lookup errors from cache are
+// returned unchanged. config is not consulted.
+func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
+	tokenizer, err := cache.TokenizerNamed(whitespace_tokenizer.Name)
+	if err != nil {
+		return nil, err
+	}
+	lowerCase, err := cache.TokenFilterNamed(lower_case_filter.Name)
+	if err != nil {
+		return nil, err
+	}
+	return &analysis.Analyzer{
+		Tokenizer:    tokenizer,
+		TokenFilters: []analysis.TokenFilter{lowerCase},
+	}, nil
+}
+
+// init registers AnalyzerConstructor as an analyzer under Name ("simple")
+// when the package is loaded.
+func init() {
+	registry.RegisterAnalyzer(Name, AnalyzerConstructor)
+}

+ 46 - 0
analysis/analyzers/standard_analyzer/standard_analyzer.go

@@ -0,0 +1,46 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+package standard_analyzer
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/analysis/language/en"
+	"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
+	"github.com/couchbaselabs/bleve/analysis/tokenizers/whitespace_tokenizer"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+const Name = "standard"
+
+// AnalyzerConstructor builds the standard analyzer: the tokenizer
+// registered as whitespace_tokenizer.Name, then the token filters
+// registered as lower_case_filter.Name and en.StopName, listed in that
+// order. The first lookup error from cache is returned unchanged. config
+// is not consulted.
+func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
+	tokenizer, err := cache.TokenizerNamed(whitespace_tokenizer.Name)
+	if err != nil {
+		return nil, err
+	}
+	lowerCase, err := cache.TokenFilterNamed(lower_case_filter.Name)
+	if err != nil {
+		return nil, err
+	}
+	englishStop, err := cache.TokenFilterNamed(en.StopName)
+	if err != nil {
+		return nil, err
+	}
+	return &analysis.Analyzer{
+		Tokenizer:    tokenizer,
+		TokenFilters: []analysis.TokenFilter{lowerCase, englishStop},
+	}, nil
+}
+
+// init registers AnalyzerConstructor as an analyzer under Name
+// ("standard") when the package is loaded.
+func init() {
+	registry.RegisterAnalyzer(Name, AnalyzerConstructor)
+}

+ 30 - 0
analysis/char_filters/html_char_filter/html_char_filter.go

@@ -0,0 +1,30 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+package html_char_filter
+
+import (
+	"regexp"
+
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/analysis/char_filters/regexp_char_filter"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+const Name = "html"
+
+var htmlCharFilterRegexp = regexp.MustCompile(`</?[!\w]+((\s+\w+(\s*=\s*(?:".*?"|'.*?'|[^'">\s]+))?)+\s*|\s*)/?>`)
+
+// CharFilterConstructor returns a regexp char filter that matches
+// htmlCharFilterRegexp and uses a single space as its replacement bytes.
+// config and cache are not consulted.
+func CharFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.CharFilter, error) {
+	return regexp_char_filter.NewRegexpCharFilter(htmlCharFilterRegexp, []byte(" ")), nil
+}
+
+// init registers CharFilterConstructor as a char filter under Name
+// ("html") when the package is loaded.
+func init() {
+	registry.RegisterCharFilter(Name, CharFilterConstructor)
+}

+ 27 - 0
analysis/char_filters/regexp_char_filter/regexp_char_filter.go

@@ -10,9 +10,15 @@ package regexp_char_filter
 
 import (
 	"bytes"
+	"fmt"
 	"regexp"
+
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/registry"
 )
 
+const Name = "regexp"
+
 type RegexpCharFilter struct {
 	r           *regexp.Regexp
 	replacement []byte
@@ -28,3 +34,24 @@ func NewRegexpCharFilter(r *regexp.Regexp, replacement []byte) *RegexpCharFilter
 // Filter returns a copy of input in which every match of s.r is replaced
 // by s.replacement repeated once per byte of the matched text.
 func (s *RegexpCharFilter) Filter(input []byte) []byte {
 	return s.r.ReplaceAllFunc(input, func(in []byte) []byte { return bytes.Repeat(s.replacement, len(in)) })
 }
+
+// RegexpCharFilterConstructor builds a RegexpCharFilter from config.
+//
+// config["regexp"] must be a string holding the pattern; it is an error if
+// the key is missing, is not a string, or does not compile.
+// config["replace"], when present as a string, supplies the replacement
+// bytes; otherwise a single space is used. cache is not consulted.
+func RegexpCharFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.CharFilter, error) {
+	pattern, ok := config["regexp"].(string)
+	if !ok {
+		return nil, fmt.Errorf("must specify regexp")
+	}
+	compiled, err := regexp.Compile(pattern)
+	if err != nil {
+		return nil, fmt.Errorf("unable to build regexp char filter: %v", err)
+	}
+	// Start from the default and override only when a string was supplied.
+	replacement := []byte(" ")
+	if custom, ok := config["replace"].(string); ok {
+		replacement = []byte(custom)
+	}
+	return NewRegexpCharFilter(compiled, replacement), nil
+}
+
+// init registers RegexpCharFilterConstructor as a char filter under Name
+// ("regexp") when the package is loaded.
+func init() {
+	registry.RegisterCharFilter(Name, RegexpCharFilterConstructor)
+}

+ 30 - 0
analysis/char_filters/zero_width_non_joiner/zero_width_non_joiner_char_filter.go

@@ -0,0 +1,30 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+package zero_width_non_joiner
+
+import (
+	"regexp"
+
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/analysis/char_filters/regexp_char_filter"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+const Name = "zero_width_spaces"
+
+var zeroWidthNonJoinerRegexp = regexp.MustCompile(`\x{200C}`)
+
+// CharFilterConstructor returns a regexp char filter that matches
+// zeroWidthNonJoinerRegexp (U+200C) and uses a single space as its
+// replacement bytes. config and cache are not consulted.
+func CharFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.CharFilter, error) {
+	return regexp_char_filter.NewRegexpCharFilter(zeroWidthNonJoinerRegexp, []byte(" ")), nil
+}
+
+// init registers CharFilterConstructor as a char filter under Name
+// ("zero_width_spaces") when the package is loaded.
+func init() {
+	registry.RegisterCharFilter(Name, CharFilterConstructor)
+}

+ 39 - 0
analysis/datetime_parsers/datetime_optional/datetime_optional.go

@@ -0,0 +1,39 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+package html_char_filter // NOTE(review): this file is datetime_parsers/datetime_optional/datetime_optional.go; the package name looks copy-pasted from html_char_filter — confirm and rename
+
+import (
+	"time"
+
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/analysis/datetime_parsers/flexible_go"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+const Name = "dateTimeOptional"
+
+// RFC 3339-style layouts, in Go reference-time notation, with parts of the
+// full form left out.
+
+// rfc3339NoTimezone omits the zone offset.
+const rfc3339NoTimezone = "2006-01-02T15:04:05"
+// rfc3339NoTimezoneNoT omits the zone offset and uses a space instead of "T".
+const rfc3339NoTimezoneNoT = "2006-01-02 15:04:05"
+// rfc3339NoTime carries the date only.
+const rfc3339NoTime = "2006-01-02"
+
+// layouts is the list handed to the flexible_go parser by
+// DateTimeParserConstructor.
+// NOTE(review): presumably tried in slice order, most specific first —
+// confirm against ParseDateTime.
+var layouts = []string{
+	time.RFC3339Nano,
+	time.RFC3339,
+	rfc3339NoTimezone,
+	rfc3339NoTimezoneNoT,
+	rfc3339NoTime,
+}
+
+// DateTimeParserConstructor returns a flexible_go date/time parser built
+// from the package-level layouts list. config and cache are not used, and
+// the returned error is always nil.
+func DateTimeParserConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.DateTimeParser, error) {
+	return flexible_go.NewFlexibleGoDateTimeParser(layouts), nil
+}
+
+// init registers DateTimeParserConstructor as a date/time parser under
+// Name ("dateTimeOptional") when the package is loaded.
+func init() {
+	registry.RegisterDateTimeParser(Name, DateTimeParserConstructor)
+}

+ 23 - 0
analysis/datetime_parsers/flexible_go/flexible_go.go

@@ -9,11 +9,15 @@
 package flexible_go
 
 import (
+	"fmt"
 	"time"
 
 	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/registry"
 )
 
+const Name = "flexiblego"
+
 // FlexibleGoDateTimeParser is a date/time parser configured with a list of
 // layout strings.
 type FlexibleGoDateTimeParser struct {
 	// presumably Go time layouts consulted by ParseDateTime — confirm
 	layouts []string
 }
@@ -33,3 +37,22 @@ func (p *FlexibleGoDateTimeParser) ParseDateTime(input string) (time.Time, error
 	}
 	return time.Time{}, analysis.INVALID_DATETIME
 }
+
+// FlexibleGoDateTimeParserConstructor builds a FlexibleGoDateTimeParser
+// from config.
+//
+// config["layouts"] must be a []interface{}; if it is missing or has
+// another type, an error is returned. Entries that are not strings are
+// skipped rather than rejected, so the resulting parser may hold fewer
+// layouts than were supplied. cache is not consulted.
+func FlexibleGoDateTimeParserConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.DateTimeParser, error) {
+	layouts, ok := config["layouts"].([]interface{})
+	if !ok {
+		return nil, fmt.Errorf("must specify layouts")
+	}
+	// At most len(layouts) entries survive, so size the slice once up front
+	// instead of growing it on every append.
+	layoutStrs := make([]string, 0, len(layouts))
+	for _, layout := range layouts {
+		if layoutStr, ok := layout.(string); ok {
+			layoutStrs = append(layoutStrs, layoutStr)
+		}
+	}
+	return NewFlexibleGoDateTimeParser(layoutStrs), nil
+}
+
+// init registers FlexibleGoDateTimeParserConstructor as a date/time parser
+// under Name ("flexiblego") when the package is loaded.
+func init() {
+	registry.RegisterDateTimeParser(Name, FlexibleGoDateTimeParserConstructor)
+}

+ 0 - 1
analysis/freq_test.go

@@ -162,6 +162,5 @@ func TestTokenFrequenciesMergeAllLeftEmpty(t *testing.T) {
 	result := tf1.MergeAll("tf2", tf2)
 	if !reflect.DeepEqual(result, expectedResult) {
 		t.Errorf("expected %#v, got %#v", expectedResult, result)
-		//t.Logf("%#v", tf1[0])
 	}
 }

+ 12 - 1
analysis/token_filters/arabic_normalize/arabic_normalize.go

@@ -6,14 +6,17 @@
 //  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 //  either express or implied. See the License for the specific language governing permissions
 //  and limitations under the License.
-package arabic_normalize
+package ar
 
 import (
 	"bytes"
 
 	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/registry"
 )
 
+const NormalizeName = "normalize_ar"
+
 const (
 	ALEF             = '\u0627'
 	ALEF_MADDA       = '\u0622'
@@ -70,3 +73,11 @@ func normalize(input []byte) []byte {
 	}
 	return analysis.BuildTermFromRunes(runes)
 }
+
+// NormalizerFilterConstructor returns the token filter produced by
+// NewArabicNormalizeFilter. config and cache are not used, and the
+// returned error is always nil.
+func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	return NewArabicNormalizeFilter(), nil
+}
+
+// init registers NormalizerFilterConstructor as a token filter under
+// NormalizeName ("normalize_ar") when the package is loaded.
+func init() {
+	registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
+}

+ 1 - 1
analysis/token_filters/arabic_normalize/arabic_normalize_test.go

@@ -6,7 +6,7 @@
 //  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 //  either express or implied. See the License for the specific language governing permissions
 //  and limitations under the License.
-package arabic_normalize
+package ar
 
 import (
 	"reflect"

+ 10 - 20
analysis/token_filters/stop_words_filter/stop_words_filter.go

@@ -6,32 +6,22 @@
 //  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 //  either express or implied. See the License for the specific language governing permissions
 //  and limitations under the License.
-package stop_words_filter
+package ar
 
 import (
 	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
+	"github.com/couchbaselabs/bleve/registry"
 )
 
-type StopWordsFilter struct {
-	stopWords analysis.WordMap
-}
-
-func NewStopWordsFilter(stopWords analysis.WordMap) *StopWordsFilter {
-	return &StopWordsFilter{
-		stopWords: stopWords,
+func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	tokenMap, err := cache.TokenMapNamed(StopName)
+	if err != nil {
+		return nil, err
 	}
+	return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
 }
 
-func (f *StopWordsFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	rv := make(analysis.TokenStream, 0)
-
-	for _, token := range input {
-		word := string(token.Term)
-		_, isStopWord := f.stopWords[word]
-		if !isStopWord {
-			rv = append(rv, token)
-		}
-	}
-
-	return rv
+func init() {
+	registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
 }

+ 18 - 1
analysis/token_filters/stop_words_filter/stop_words_ar.go

@@ -1,4 +1,11 @@
-package stop_words_filter
+package ar
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+const StopName = "stop_ar"
 
 // this content was obtained from:
 // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis
@@ -130,3 +137,13 @@ var ArabicStopWords = []byte(`# This file was created by Jacques Savoy and is di
 لدى
 جميع
 `)
+
+// TokenMapConstructor creates a new token map and loads ArabicStopWords
+// into it. The map is returned together with whatever error LoadBytes
+// reported (nil on success). config and cache are not consulted.
+func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
+	stopWords := analysis.NewTokenMap()
+	if err := stopWords.LoadBytes(ArabicStopWords); err != nil {
+		return stopWords, err
+	}
+	return stopWords, nil
+}
+
+// init registers TokenMapConstructor as a token map under StopName
+// ("stop_ar") when the package is loaded.
+func init() {
+	registry.RegisterTokenMap(StopName, TokenMapConstructor)
+}

+ 27 - 0
analysis/language/bg/stop_filter_bg.go

@@ -0,0 +1,27 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+package bg
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// StopTokenFilterConstructor resolves the token map registered as StopName
+// through cache and wraps it in a stop tokens filter. A failed lookup is
+// returned unchanged. config is not consulted.
+func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	stopWords, err := cache.TokenMapNamed(StopName)
+	if err != nil {
+		return nil, err
+	}
+	stopFilter := stop_tokens_filter.NewStopTokensFilter(stopWords)
+	return stopFilter, nil
+}
+
+// init registers StopTokenFilterConstructor as a token filter under
+// StopName when the package is loaded.
+func init() {
+	registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
+}

+ 18 - 1
analysis/token_filters/stop_words_filter/stop_words_bg.go

@@ -1,4 +1,11 @@
-package stop_words_filter
+package bg
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+const StopName = "stop_bg"
 
 // this content was obtained from:
 // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
@@ -198,3 +205,13 @@ var BulgarianStopWords = []byte(`# This file was created by Jacques Savoy and is
 щом
 я
 `)
+
+// TokenMapConstructor creates a new token map and loads BulgarianStopWords
+// into it. The map is returned together with whatever error LoadBytes
+// reported (nil on success). config and cache are not consulted.
+func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
+	stopWords := analysis.NewTokenMap()
+	if err := stopWords.LoadBytes(BulgarianStopWords); err != nil {
+		return stopWords, err
+	}
+	return stopWords, nil
+}
+
+// init registers TokenMapConstructor as a token map under StopName
+// ("stop_bg") when the package is loaded.
+func init() {
+	registry.RegisterTokenMap(StopName, TokenMapConstructor)
+}

+ 30 - 0
analysis/language/ca/articles_ca.go

@@ -0,0 +1,30 @@
+package ca
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+const ArticlesName = "articles_ca"
+
+// this content was obtained from:
+// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis
+
+var CatalanArticles = []byte(`
+d
+l
+m
+n
+s
+t
+`)
+
+// ArticlesTokenMapConstructor creates a new token map and loads
+// CatalanArticles into it. The map is returned together with whatever
+// error LoadBytes reported (nil on success). config and cache are not
+// consulted.
+func ArticlesTokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
+	articles := analysis.NewTokenMap()
+	if err := articles.LoadBytes(CatalanArticles); err != nil {
+		return articles, err
+	}
+	return articles, nil
+}
+
+// init registers ArticlesTokenMapConstructor as a token map under
+// ArticlesName ("articles_ca") when the package is loaded.
+func init() {
+	registry.RegisterTokenMap(ArticlesName, ArticlesTokenMapConstructor)
+}

+ 31 - 0
analysis/language/ca/elision_ca.go

@@ -0,0 +1,31 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+package ca
+
+import (
+	"fmt"
+
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/analysis/token_filters/elision_filter"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+const ElisionName = "elision_ca"
+
+// ElisionFilterConstructor resolves the token map registered as
+// ArticlesName through cache and builds an elision filter from it. A
+// failed lookup is reported with an "error building elision filter"
+// prefix. config is not consulted.
+func ElisionFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	articles, err := cache.TokenMapNamed(ArticlesName)
+	if err != nil {
+		return nil, fmt.Errorf("error building elision filter: %v", err)
+	}
+	elision := elision_filter.NewElisionFilter(articles)
+	return elision, nil
+}
+
+// init registers ElisionFilterConstructor as a token filter under
+// ElisionName ("elision_ca") when the package is loaded.
+func init() {
+	registry.RegisterTokenFilter(ElisionName, ElisionFilterConstructor)
+}

+ 55 - 0
analysis/language/ca/elision_ca_test.go

@@ -0,0 +1,55 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+package ca
+
+import (
+	"reflect"
+	"testing"
+
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// TestCatalanElision checks that the token filter registered under
+// ElisionName turns "l'Institut" and "d'Estudis" into "Institut" and
+// "Estudis".
+func TestCatalanElision(t *testing.T) {
+	tests := []struct {
+		input  analysis.TokenStream
+		output analysis.TokenStream
+	}{
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("l'Institut"),
+				},
+				&analysis.Token{
+					Term: []byte("d'Estudis"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("Institut"),
+				},
+				&analysis.Token{
+					Term: []byte("Estudis"),
+				},
+			},
+		},
+	}
+
+	// terms flattens a stream to its term strings for failure messages.
+	terms := func(stream analysis.TokenStream) []string {
+		rv := make([]string, 0, len(stream))
+		for _, token := range stream {
+			rv = append(rv, string(token.Term))
+		}
+		return rv
+	}
+
+	cache := registry.NewCache()
+	elisionFilter, err := cache.TokenFilterNamed(ElisionName)
+	if err != nil {
+		t.Fatal(err)
+	}
+	for _, test := range tests {
+		actual := elisionFilter.Filter(test.input)
+		if !reflect.DeepEqual(actual, test.output) {
+			// Print every term instead of indexing element 0, which would
+			// panic if the filter returned an empty stream.
+			t.Errorf("expected %q, got %q", terms(test.output), terms(actual))
+		}
+	}
+}

+ 27 - 0
analysis/language/ca/stop_filter_ca.go

@@ -0,0 +1,27 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+package ca
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// StopTokenFilterConstructor resolves the token map registered as StopName
+// through cache and wraps it in a stop tokens filter. A failed lookup is
+// returned unchanged. config is not consulted.
+func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	stopWords, err := cache.TokenMapNamed(StopName)
+	if err != nil {
+		return nil, err
+	}
+	stopFilter := stop_tokens_filter.NewStopTokensFilter(stopWords)
+	return stopFilter, nil
+}
+
+// init registers StopTokenFilterConstructor as a token filter under
+// StopName when the package is loaded.
+func init() {
+	registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
+}

+ 18 - 1
analysis/token_filters/stop_words_filter/stop_words_ca.go

@@ -1,4 +1,11 @@
-package stop_words_filter
+package ca
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+const StopName = "stop_ca"
 
 // this content was obtained from:
 // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
@@ -225,3 +232,13 @@ vostra
 vostre
 vostres
 `)
+
+// TokenMapConstructor creates a new token map and loads CatalanStopWords
+// into it. The map is returned together with whatever error LoadBytes
+// reported (nil on success). config and cache are not consulted.
+func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
+	stopWords := analysis.NewTokenMap()
+	if err := stopWords.LoadBytes(CatalanStopWords); err != nil {
+		return stopWords, err
+	}
+	return stopWords, nil
+}
+
+// init registers TokenMapConstructor as a token map under StopName
+// ("stop_ca") when the package is loaded.
+func init() {
+	registry.RegisterTokenMap(StopName, TokenMapConstructor)
+}

+ 55 - 0
analysis/language/ckb/analyzer_ckb.go

@@ -0,0 +1,55 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+package ckb
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
+	"github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// AnalyzerName is the name under which AnalyzerConstructor is registered.
+const AnalyzerName = "ckb"
+
+// AnalyzerConstructor assembles the "ckb" analyzer from components held in
+// the cache: the unicode word boundary tokenizer followed, in order, by the
+// normalize, lower case, stop and stemmer token filters. The first
+// component that cannot be resolved aborts construction and its error is
+// returned. The config argument is not consulted.
+func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
+	tokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name)
+	if err != nil {
+		return nil, err
+	}
+	// Filters are resolved in the same order in which they are applied.
+	filterNames := []string{NormalizeName, lower_case_filter.Name, StopName, StemmerName}
+	filters := make([]analysis.TokenFilter, 0, len(filterNames))
+	for _, name := range filterNames {
+		filter, err := cache.TokenFilterNamed(name)
+		if err != nil {
+			return nil, err
+		}
+		filters = append(filters, filter)
+	}
+	return &analysis.Analyzer{
+		Tokenizer:    tokenizer,
+		TokenFilters: filters,
+	}, nil
+}
+
+// init registers AnalyzerConstructor as an analyzer under AnalyzerName.
+func init() {
+	registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
+}

+ 12 - 1
analysis/token_filters/sorani_normalize/sorani_normalize.go

@@ -6,15 +6,18 @@
 //  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 //  either express or implied. See the License for the specific language governing permissions
 //  and limitations under the License.
-package sorani_normalize
+package ckb
 
 import (
 	"bytes"
 	"unicode"
 
 	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/registry"
 )
 
+// NormalizeName is the name under which NormalizerFilterConstructor is
+// registered as a token filter.
+const NormalizeName = "normalize_ckb"
+
 const (
 	YEH         = '\u064A'
 	DOTLESS_YEH = '\u0649'
@@ -103,3 +106,11 @@ func normalize(input []byte) []byte {
 	}
 	return analysis.BuildTermFromRunes(runes)
 }
+
+// NormalizerFilterConstructor returns a new Sorani normalize filter.
+// It cannot fail; neither config nor cache is consulted.
+func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	normalizer := NewSoraniNormalizeFilter()
+	return normalizer, nil
+}
+
+// init registers NormalizerFilterConstructor as a token filter under
+// NormalizeName.
+func init() {
+	registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
+}

+ 1 - 1
analysis/token_filters/sorani_normalize/sorani_normalize_test.go

@@ -6,7 +6,7 @@
 //  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 //  either express or implied. See the License for the specific language governing permissions
 //  and limitations under the License.
-package sorani_normalize
+package ckb
 
 import (
 	"reflect"

+ 12 - 1
analysis/token_filters/sorani_stemmer_filter/sorani_stemmer_filter.go

@@ -6,15 +6,18 @@
 //  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 //  either express or implied. See the License for the specific language governing permissions
 //  and limitations under the License.
-package sorani_stemmer_filter
+package ckb
 
 import (
 	"bytes"
 	"unicode/utf8"
 
 	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/registry"
 )
 
+// StemmerName is the name under which StemmerFilterConstructor is
+// registered as a token filter.
+const StemmerName = "stemmer_ckb"
+
 type SoraniStemmerFilter struct {
 }
 
@@ -133,3 +136,11 @@ func buildTermFromRunes(runes []rune) []byte {
 	}
 	return rv
 }
+
+// StemmerFilterConstructor returns a new Sorani stemmer filter.
+// It cannot fail; neither config nor cache is consulted.
+func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	stemmer := NewSoraniStemmerFilter()
+	return stemmer, nil
+}
+
+// init registers StemmerFilterConstructor as a token filter under
+// StemmerName.
+func init() {
+	registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
+}

+ 2 - 3
analysis/token_filters/sorani_stemmer_filter/sorani_stemmer_filter_test.go

@@ -6,14 +6,13 @@
 //  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 //  either express or implied. See the License for the specific language governing permissions
 //  and limitations under the License.
-package sorani_stemmer_filter
+package ckb
 
 import (
 	"reflect"
 	"testing"
 
 	"github.com/couchbaselabs/bleve/analysis"
-	"github.com/couchbaselabs/bleve/analysis/token_filters/sorani_normalize"
 	"github.com/couchbaselabs/bleve/analysis/tokenizers/single_token"
 )
 
@@ -24,7 +23,7 @@ func TestSoraniStemmerFilter(t *testing.T) {
 	analyzer := analysis.Analyzer{
 		Tokenizer: single_token.NewSingleTokenTokenizer(),
 		TokenFilters: []analysis.TokenFilter{
-			sorani_normalize.NewSoraniNormalizeFilter(),
+			NewSoraniNormalizeFilter(),
 			NewSoraniStemmerFilter(),
 		},
 	}

+ 27 - 0
analysis/language/ckb/stop_filter_ckb.go

@@ -0,0 +1,27 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+package ckb
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// StopTokenFilterConstructor builds the stop token filter of package ckb.
+// It fetches the token map registered under StopName from the cache and
+// wraps it in a stop tokens filter; a failed lookup is returned as is.
+// The config argument is not consulted.
+func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	stopWords, lookupErr := cache.TokenMapNamed(StopName)
+	if lookupErr != nil {
+		return nil, lookupErr
+	}
+	stopFilter := stop_tokens_filter.NewStopTokensFilter(stopWords)
+	return stopFilter, nil
+}
+
+// init registers StopTokenFilterConstructor as a token filter under StopName.
+func init() {
+	registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
+}

+ 18 - 1
analysis/token_filters/stop_words_filter/stop_words_ckb.go

@@ -1,4 +1,11 @@
-package stop_words_filter
+package ckb
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// StopName is the name under which this package registers both its stop
+// word token map and its stop token filter.
+const StopName = "stop_ckb"
 
 // this content was obtained from:
 // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
@@ -141,3 +148,13 @@ var SoraniStopWords = []byte(`# set of kurdish stopwords
 # like
 وەک
 `)
+
+// TokenMapConstructor creates a fresh token map and loads SoraniStopWords
+// into it. The map is returned together with whatever error LoadBytes
+// reported. Neither config nor cache is consulted.
+func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
+	stopWords := analysis.NewTokenMap()
+	loadErr := stopWords.LoadBytes(SoraniStopWords)
+	return stopWords, loadErr
+}
+
+// init registers TokenMapConstructor as a token map under StopName.
+func init() {
+	registry.RegisterTokenMap(StopName, TokenMapConstructor)
+}

+ 27 - 0
analysis/language/cs/stop_filter_cs.go

@@ -0,0 +1,27 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+package cs
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// StopTokenFilterConstructor builds the stop token filter of package cs.
+// It fetches the token map registered under StopName from the cache and
+// wraps it in a stop tokens filter; a failed lookup is returned as is.
+// The config argument is not consulted.
+func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	stopWords, lookupErr := cache.TokenMapNamed(StopName)
+	if lookupErr != nil {
+		return nil, lookupErr
+	}
+	stopFilter := stop_tokens_filter.NewStopTokensFilter(stopWords)
+	return stopFilter, nil
+}
+
+// init registers StopTokenFilterConstructor as a token filter under StopName.
+func init() {
+	registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
+}

+ 18 - 1
analysis/token_filters/stop_words_filter/stop_words_cs.go

@@ -1,4 +1,11 @@
-package stop_words_filter
+package cs
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// StopName is the name under which this package registers both its stop
+// word token map and its stop token filter.
+const StopName = "stop_cs"
 
 // this content was obtained from:
 // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
@@ -177,3 +184,13 @@ jež
 jakož
 načež
 `)
+
+// TokenMapConstructor creates a fresh token map and loads CzechStopWords
+// into it. The map is returned together with whatever error LoadBytes
+// reported. Neither config nor cache is consulted.
+func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
+	stopWords := analysis.NewTokenMap()
+	loadErr := stopWords.LoadBytes(CzechStopWords)
+	return stopWords, loadErr
+}
+
+// init registers TokenMapConstructor as a token map under StopName.
+func init() {
+	registry.RegisterTokenMap(StopName, TokenMapConstructor)
+}

+ 50 - 0
analysis/language/da/analyzer_da.go

@@ -0,0 +1,50 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+package da
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
+	"github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// AnalyzerName is the name under which AnalyzerConstructor is registered.
+const AnalyzerName = "da"
+
+// AnalyzerConstructor assembles the "da" analyzer from components held in
+// the cache: the unicode word boundary tokenizer followed, in order, by the
+// lower case, stop and stemmer token filters. The first component that
+// cannot be resolved aborts construction and its error is returned. The
+// config argument is not consulted.
+func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
+	tokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name)
+	if err != nil {
+		return nil, err
+	}
+	// Filters are resolved in the same order in which they are applied.
+	filterNames := []string{lower_case_filter.Name, StopName, StemmerName}
+	filters := make([]analysis.TokenFilter, 0, len(filterNames))
+	for _, name := range filterNames {
+		filter, err := cache.TokenFilterNamed(name)
+		if err != nil {
+			return nil, err
+		}
+		filters = append(filters, filter)
+	}
+	return &analysis.Analyzer{
+		Tokenizer:    tokenizer,
+		TokenFilters: filters,
+	}, nil
+}
+
+// init registers AnalyzerConstructor as an analyzer under AnalyzerName.
+func init() {
+	registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
+}

+ 9 - 7
analysis/tokenizers/rune_tokenizer/whitespace_classifier.go

@@ -6,18 +6,20 @@
 //  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 //  either express or implied. See the License for the specific language governing permissions
 //  and limitations under the License.
-package rune_tokenizer
+package da
 
 import (
-	"unicode"
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter"
+	"github.com/couchbaselabs/bleve/registry"
 )
 
-type WhitespaceClassifier struct{}
+// StemmerName is the name under which StemmerFilterConstructor is
+// registered as a token filter.
+const StemmerName = "stemmer_da"
 
-func NewWhitespaceClassifier() *WhitespaceClassifier {
-	return &WhitespaceClassifier{}
+// StemmerFilterConstructor returns the result of
+// stemmer_filter.NewStemmerFilter("da") unchanged, including any error it
+// reports. Neither config nor cache is consulted.
+func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	return stemmer_filter.NewStemmerFilter("da")
 }
 
-func (c *WhitespaceClassifier) InToken(r rune) bool {
-	return !unicode.IsSpace(r)
+// init registers StemmerFilterConstructor as a token filter under
+// StemmerName.
+func init() {
+	registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
 }

+ 27 - 0
analysis/language/da/stop_filter_da.go

@@ -0,0 +1,27 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+package da
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// StopTokenFilterConstructor builds the stop token filter of package da.
+// It fetches the token map registered under StopName from the cache and
+// wraps it in a stop tokens filter; a failed lookup is returned as is.
+// The config argument is not consulted.
+func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	stopWords, lookupErr := cache.TokenMapNamed(StopName)
+	if lookupErr != nil {
+		return nil, lookupErr
+	}
+	stopFilter := stop_tokens_filter.NewStopTokensFilter(stopWords)
+	return stopFilter, nil
+}
+
+// init registers StopTokenFilterConstructor as a token filter under StopName.
+func init() {
+	registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
+}

+ 18 - 1
analysis/token_filters/stop_words_filter/stop_words_da.go

@@ -1,4 +1,11 @@
-package stop_words_filter
+package da
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// StopName is the name under which this package registers both its stop
+// word token map and its stop token filter.
+const StopName = "stop_da"
 
 // this content was obtained from:
 // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
@@ -115,3 +122,13 @@ thi          | for (conj)
 jer          | you
 sådan        | such, like this/like that
 `)
+
+// TokenMapConstructor creates a fresh token map and loads DanishStopWords
+// into it. The map is returned together with whatever error LoadBytes
+// reported. Neither config nor cache is consulted.
+func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
+	stopWords := analysis.NewTokenMap()
+	loadErr := stopWords.LoadBytes(DanishStopWords)
+	return stopWords, loadErr
+}
+
+// init registers TokenMapConstructor as a token map under StopName.
+func init() {
+	registry.RegisterTokenMap(StopName, TokenMapConstructor)
+}

+ 55 - 0
analysis/language/de/analyzer_de.go

@@ -0,0 +1,55 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+package de
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
+	"github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// AnalyzerName is the name under which AnalyzerConstructor is registered.
+const AnalyzerName = "de"
+
+// AnalyzerConstructor assembles the "de" analyzer from components held in
+// the cache: the unicode word boundary tokenizer followed, in order, by the
+// lower case, stop, normalize and stemmer token filters. The first
+// component that cannot be resolved aborts construction and its error is
+// returned. The config argument is not consulted.
+func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
+	unicodeTokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name)
+	if err != nil {
+		return nil, err
+	}
+	toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
+	if err != nil {
+		return nil, err
+	}
+	// The stop filter is registered under StopName. Looking it up under
+	// NormalizeName would put the normalizer into the chain twice and leave
+	// stop words in the token stream.
+	stopDeFilter, err := cache.TokenFilterNamed(StopName)
+	if err != nil {
+		return nil, err
+	}
+	normalizeDeFilter, err := cache.TokenFilterNamed(NormalizeName)
+	if err != nil {
+		return nil, err
+	}
+	stemmerDeFilter, err := cache.TokenFilterNamed(StemmerName)
+	if err != nil {
+		return nil, err
+	}
+	rv := analysis.Analyzer{
+		Tokenizer: unicodeTokenizer,
+		TokenFilters: []analysis.TokenFilter{
+			toLowerFilter,
+			stopDeFilter,
+			normalizeDeFilter,
+			stemmerDeFilter,
+		},
+	}
+	return &rv, nil
+}
+
+// init registers AnalyzerConstructor as an analyzer under AnalyzerName.
+func init() {
+	registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
+}

+ 12 - 1
analysis/token_filters/german_normalize/german_normalize.go

@@ -6,14 +6,17 @@
 //  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 //  either express or implied. See the License for the specific language governing permissions
 //  and limitations under the License.
-package german_normalize
+package de
 
 import (
 	"bytes"
 
 	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/registry"
 )
 
+// NormalizeName is the name under which NormalizerFilterConstructor is
+// registered as a token filter.
+const NormalizeName = "normalize_de"
+
 const (
 	N = 0 /* ordinary state */
 	V = 1 /* stops 'u' from entering umlaut state */
@@ -84,3 +87,11 @@ func normalize(input []byte) []byte {
 	}
 	return analysis.BuildTermFromRunes(runes)
 }
+
+// NormalizerFilterConstructor returns a new German normalize filter.
+// It cannot fail; neither config nor cache is consulted.
+func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	normalizer := NewGermanNormalizeFilter()
+	return normalizer, nil
+}
+
+// init registers NormalizerFilterConstructor as a token filter under
+// NormalizeName.
+func init() {
+	registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
+}

+ 1 - 1
analysis/token_filters/german_normalize/german_normalize_test.go

@@ -6,7 +6,7 @@
 //  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 //  either express or implied. See the License for the specific language governing permissions
 //  and limitations under the License.
-package german_normalize
+package de
 
 import (
 	"reflect"

+ 25 - 0
analysis/language/de/stemmer_de.go

@@ -0,0 +1,25 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+package de
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// StemmerName is the name under which StemmerFilterConstructor is
+// registered as a token filter.
+const StemmerName = "stemmer_de"
+
+// StemmerFilterConstructor returns the result of
+// stemmer_filter.NewStemmerFilter("de") unchanged, including any error it
+// reports. Neither config nor cache is consulted.
+func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	return stemmer_filter.NewStemmerFilter("de")
+}
+
+// init registers StemmerFilterConstructor as a token filter under
+// StemmerName.
+func init() {
+	registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
+}

+ 27 - 0
analysis/language/de/stop_filter_de.go

@@ -0,0 +1,27 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+package de
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// StopTokenFilterConstructor builds the stop token filter of package de.
+// It fetches the token map registered under StopName from the cache and
+// wraps it in a stop tokens filter; a failed lookup is returned as is.
+// The config argument is not consulted.
+func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	stopWords, lookupErr := cache.TokenMapNamed(StopName)
+	if lookupErr != nil {
+		return nil, lookupErr
+	}
+	stopFilter := stop_tokens_filter.NewStopTokensFilter(stopWords)
+	return stopFilter, nil
+}
+
+// init registers StopTokenFilterConstructor as a token filter under StopName.
+func init() {
+	registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
+}

+ 18 - 1
analysis/token_filters/stop_words_filter/stop_words_de.go

@@ -1,4 +1,11 @@
-package stop_words_filter
+package de
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// StopName is the name under which this package registers both its stop
+// word token map and its stop token filter.
+const StopName = "stop_de"
 
 // this content was obtained from:
 // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
@@ -299,3 +306,13 @@ zwar           |  indeed
 zwischen       |  between
 
 `)
+
+// TokenMapConstructor creates a fresh token map and loads GermanStopWords
+// into it. The map is returned together with whatever error LoadBytes
+// reported. Neither config nor cache is consulted.
+func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
+	stopWords := analysis.NewTokenMap()
+	loadErr := stopWords.LoadBytes(GermanStopWords)
+	return stopWords, loadErr
+}
+
+// init registers TokenMapConstructor as a token map under StopName.
+func init() {
+	registry.RegisterTokenMap(StopName, TokenMapConstructor)
+}

+ 27 - 0
analysis/language/el/stop_filter_el.go

@@ -0,0 +1,27 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+package el
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// StopTokenFilterConstructor builds the stop token filter of package el.
+// It fetches the token map registered under StopName from the cache and
+// wraps it in a stop tokens filter; a failed lookup is returned as is.
+// The config argument is not consulted.
+func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	stopWords, lookupErr := cache.TokenMapNamed(StopName)
+	if lookupErr != nil {
+		return nil, lookupErr
+	}
+	stopFilter := stop_tokens_filter.NewStopTokensFilter(stopWords)
+	return stopFilter, nil
+}
+
+// init registers StopTokenFilterConstructor as a token filter under StopName.
+func init() {
+	registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
+}

+ 18 - 1
analysis/token_filters/stop_words_filter/stop_words_el.go

@@ -1,4 +1,11 @@
-package stop_words_filter
+package el
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// StopName is the name under which this package registers both its stop
+// word token map and its stop token filter.
+const StopName = "stop_el"
 
 // this content was obtained from:
 // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
@@ -83,3 +90,13 @@ var GreekStopWords = []byte(`# Lucene Greek Stopwords list
 οσο
 οτι
 `)
+
+// TokenMapConstructor creates a fresh token map and loads GreekStopWords
+// into it. The map is returned together with whatever error LoadBytes
+// reported. Neither config nor cache is consulted.
+func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
+	stopWords := analysis.NewTokenMap()
+	loadErr := stopWords.LoadBytes(GreekStopWords)
+	return stopWords, loadErr
+}
+
+// init registers TokenMapConstructor as a token map under StopName.
+func init() {
+	registry.RegisterTokenMap(StopName, TokenMapConstructor)
+}

+ 51 - 0
analysis/language/en/analyzer_en.go

@@ -0,0 +1,51 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+package en
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/registry"
+
+	"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
+	"github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary"
+)
+
+// AnalyzerName is the name under which AnalyzerConstructor is registered.
+const AnalyzerName = "en"
+
+// AnalyzerConstructor assembles the "en" analyzer from components held in
+// the cache: the unicode word boundary tokenizer followed, in order, by the
+// lower case, stop and stemmer token filters. The first component that
+// cannot be resolved aborts construction and its error is returned. The
+// config argument is not consulted.
+func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
+	tokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name)
+	if err != nil {
+		return nil, err
+	}
+	// Filters are resolved in the same order in which they are applied.
+	filterNames := []string{lower_case_filter.Name, StopName, StemmerName}
+	filters := make([]analysis.TokenFilter, 0, len(filterNames))
+	for _, name := range filterNames {
+		filter, err := cache.TokenFilterNamed(name)
+		if err != nil {
+			return nil, err
+		}
+		filters = append(filters, filter)
+	}
+	return &analysis.Analyzer{
+		Tokenizer:    tokenizer,
+		TokenFilters: filters,
+	}, nil
+}
+
+// init registers AnalyzerConstructor as an analyzer under AnalyzerName.
+func init() {
+	registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
+}

+ 25 - 0
analysis/language/en/stemmer_en.go

@@ -0,0 +1,25 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+package en
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// StemmerName is the name under which StemmerFilterConstructor is
+// registered as a token filter.
+const StemmerName = "stemmer_en"
+
+// StemmerFilterConstructor returns the result of
+// stemmer_filter.NewStemmerFilter("en") unchanged, including any error it
+// reports. Neither config nor cache is consulted.
+func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	return stemmer_filter.NewStemmerFilter("en")
+}
+
+// init registers StemmerFilterConstructor as a token filter under
+// StemmerName.
+func init() {
+	registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
+}

+ 69 - 0
analysis/language/en/stemmer_en_test.go

@@ -0,0 +1,69 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+package en
+
+import (
+	"reflect"
+	"testing"
+
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// TestEnglishStemmer checks that the token filter registered under
+// StemmerName stems ordinary tokens and leaves a token flagged as a keyword
+// unchanged.
+func TestEnglishStemmer(t *testing.T) {
+	tests := []struct {
+		input  analysis.TokenStream
+		output analysis.TokenStream
+	}{
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("walking"),
+				},
+				&analysis.Token{
+					Term: []byte("talked"),
+				},
+				&analysis.Token{
+					Term: []byte("business"),
+				},
+				&analysis.Token{
+					Term:    []byte("protected"),
+					KeyWord: true,
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("walk"),
+				},
+				&analysis.Token{
+					Term: []byte("talk"),
+				},
+				&analysis.Token{
+					Term: []byte("busi"),
+				},
+				&analysis.Token{
+					Term:    []byte("protected"),
+					KeyWord: true,
+				},
+			},
+		},
+	}
+
+	cache := registry.NewCache()
+	stemmerFilter, err := cache.TokenFilterNamed(StemmerName)
+	if err != nil {
+		t.Fatal(err)
+	}
+	for _, test := range tests {
+		actual := stemmerFilter.Filter(test.input)
+		if reflect.DeepEqual(actual, test.output) {
+			continue
+		}
+		// Compare lengths first so the per-token report below never indexes
+		// past the end of a short (or empty) result.
+		if len(actual) != len(test.output) {
+			t.Errorf("expected %d tokens, got %d", len(test.output), len(actual))
+			continue
+		}
+		// Report every token that differs, not just the first one.
+		for i := range test.output {
+			if !reflect.DeepEqual(actual[i], test.output[i]) {
+				t.Errorf("token %d: expected %s (keyword=%t), got %s (keyword=%t)",
+					i, test.output[i].Term, test.output[i].KeyWord, actual[i].Term, actual[i].KeyWord)
+			}
+		}
+	}
+}

+ 27 - 0
analysis/language/en/stop_filter_en.go

@@ -0,0 +1,27 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+package en
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// StopTokenFilterConstructor builds the stop token filter of package en.
+// It fetches the token map registered under StopName from the cache and
+// wraps it in a stop tokens filter; a failed lookup is returned as is.
+// The config argument is not consulted.
+func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	stopWords, lookupErr := cache.TokenMapNamed(StopName)
+	if lookupErr != nil {
+		return nil, lookupErr
+	}
+	stopFilter := stop_tokens_filter.NewStopTokensFilter(stopWords)
+	return stopFilter, nil
+}
+
+// init registers StopTokenFilterConstructor as a token filter under StopName.
+func init() {
+	registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
+}

+ 18 - 1
analysis/token_filters/stop_words_filter/stop_words_en.go

@@ -1,4 +1,11 @@
-package stop_words_filter
+package en
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// StopName is the name under which this package registers both its stop
+// word token map and its stop token filter.
+const StopName = "stop_en"
 
 // this content was obtained from:
 // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
@@ -324,3 +331,13 @@ very
     | high
     | long
 `)
+
+// TokenMapConstructor creates a fresh token map and loads EnglishStopWords
+// into it. The map is returned together with whatever error LoadBytes
+// reported. Neither config nor cache is consulted.
+func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
+	stopWords := analysis.NewTokenMap()
+	loadErr := stopWords.LoadBytes(EnglishStopWords)
+	return stopWords, loadErr
+}
+
+// init registers TokenMapConstructor as a token map under StopName.
+func init() {
+	registry.RegisterTokenMap(StopName, TokenMapConstructor)
+}

+ 51 - 0
analysis/language/es/analyzer_es.go

@@ -0,0 +1,51 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+package es
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/registry"
+
+	"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
+	"github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary"
+)
+
+// AnalyzerName is the name under which AnalyzerConstructor is registered.
+const AnalyzerName = "es"
+
+// AnalyzerConstructor assembles the "es" analyzer from components held in
+// the cache: the unicode word boundary tokenizer followed, in order, by the
+// lower case, stop and stemmer token filters. The first component that
+// cannot be resolved aborts construction and its error is returned. The
+// config argument is not consulted.
+func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
+	tokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name)
+	if err != nil {
+		return nil, err
+	}
+	// Filters are resolved in the same order in which they are applied.
+	filterNames := []string{lower_case_filter.Name, StopName, StemmerName}
+	filters := make([]analysis.TokenFilter, 0, len(filterNames))
+	for _, name := range filterNames {
+		filter, err := cache.TokenFilterNamed(name)
+		if err != nil {
+			return nil, err
+		}
+		filters = append(filters, filter)
+	}
+	return &analysis.Analyzer{
+		Tokenizer:    tokenizer,
+		TokenFilters: filters,
+	}, nil
+}
+
+// init registers AnalyzerConstructor as an analyzer under AnalyzerName.
+func init() {
+	registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
+}

+ 25 - 0
analysis/language/es/stemmer_es.go

@@ -0,0 +1,25 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+package es
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// StemmerName is the name under which StemmerFilterConstructor is
+// registered as a token filter.
+const StemmerName = "stemmer_es"
+
+// StemmerFilterConstructor returns the result of
+// stemmer_filter.NewStemmerFilter("es") unchanged, including any error it
+// reports. Neither config nor cache is consulted.
+func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	return stemmer_filter.NewStemmerFilter("es")
+}
+
+// init registers StemmerFilterConstructor as a token filter under
+// StemmerName.
+func init() {
+	registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
+}

+ 27 - 0
analysis/language/es/stop_filter_es.go

@@ -0,0 +1,27 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+package es
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// StopTokenFilterConstructor builds a stop tokens filter backed by the token
+// map that cache resolves for StopName. The config argument is not consulted.
+// If the token map cannot be resolved, the lookup error is returned unchanged
+// and no filter is built.
+func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	stopWords, lookupErr := cache.TokenMapNamed(StopName)
+	if lookupErr != nil {
+		return nil, lookupErr
+	}
+	filter := stop_tokens_filter.NewStopTokensFilter(stopWords)
+	return filter, nil
+}
+
+// init registers StopTokenFilterConstructor with the registry under StopName
+// during package initialization.
+func init() {
+	registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
+}

+ 18 - 1
analysis/token_filters/stop_words_filter/stop_words_es.go

@@ -1,4 +1,11 @@
-package stop_words_filter
+package es
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// StopName is the name under which this package registers its Spanish stop
+// word token map with the registry.
+const StopName = "stop_es"
 
 // this content was obtained from:
 // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
@@ -361,3 +368,13 @@ tenidas
 tened
 
 `)
+
+// TokenMapConstructor creates a fresh token map and loads SpanishStopWords
+// into it. Neither config nor cache is consulted. If loading fails, the map
+// as loaded so far is still returned together with the error.
+func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
+	stopWords := analysis.NewTokenMap()
+	if err := stopWords.LoadBytes(SpanishStopWords); err != nil {
+		return stopWords, err
+	}
+	return stopWords, nil
+}
+
+// init registers TokenMapConstructor with the registry under StopName during
+// package initialization.
+func init() {
+	registry.RegisterTokenMap(StopName, TokenMapConstructor)
+}

+ 27 - 0
analysis/language/eu/stop_filter_eu.go

@@ -0,0 +1,27 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+package eu
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// StopTokenFilterConstructor builds a stop tokens filter backed by the token
+// map that cache resolves for StopName. The config argument is not consulted.
+// If the token map cannot be resolved, the lookup error is returned unchanged
+// and no filter is built.
+func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	stopWords, lookupErr := cache.TokenMapNamed(StopName)
+	if lookupErr != nil {
+		return nil, lookupErr
+	}
+	filter := stop_tokens_filter.NewStopTokensFilter(stopWords)
+	return filter, nil
+}
+
+// init registers StopTokenFilterConstructor with the registry under StopName
+// during package initialization.
+func init() {
+	registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
+}

+ 18 - 1
analysis/token_filters/stop_words_filter/stop_words_eu.go

@@ -1,4 +1,11 @@
-package stop_words_filter
+package eu
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// StopName is the name under which this package registers its Basque stop
+// word token map with the registry.
+const StopName = "stop_eu"
 
 // this content was obtained from:
 // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
@@ -104,3 +111,13 @@ zuek
 zuen
 zuten
 `)
+
+// TokenMapConstructor creates a fresh token map and loads BasqueStopWords
+// into it. Neither config nor cache is consulted. If loading fails, the map
+// as loaded so far is still returned together with the error.
+func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
+	stopWords := analysis.NewTokenMap()
+	if err := stopWords.LoadBytes(BasqueStopWords); err != nil {
+		return stopWords, err
+	}
+	return stopWords, nil
+}
+
+// init registers TokenMapConstructor with the registry under StopName during
+// package initialization.
+func init() {
+	registry.RegisterTokenMap(StopName, TokenMapConstructor)
+}

+ 65 - 0
analysis/language/fa/analyzer_fa.go

@@ -0,0 +1,65 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+package fa
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/registry"
+
+	"github.com/couchbaselabs/bleve/analysis/char_filters/zero_width_non_joiner"
+	"github.com/couchbaselabs/bleve/analysis/language/ar"
+	"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
+	"github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary"
+)
+
+// AnalyzerName is the name under which this package registers its analyzer
+// with the registry.
+const AnalyzerName = "fa"
+
+// AnalyzerConstructor assembles the analyzer registered under AnalyzerName
+// from components resolved by name through cache: the zero width non joiner
+// char filter, the unicode word boundary tokenizer, and a token filter chain
+// of lower case, ar.NormalizeName, NormalizeName and StopName, applied in
+// that order. The config argument is not consulted. The first lookup that
+// fails aborts construction and its error is returned unchanged.
+func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
+	zwnjCharFilter, err := cache.CharFilterNamed(zero_width_non_joiner.Name)
+	if err != nil {
+		return nil, err
+	}
+	tokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name)
+	if err != nil {
+		return nil, err
+	}
+	// the lookups below run in a different order than the filter chain;
+	// only the slice order further down determines how tokens are processed
+	arabicNormalizer, err := cache.TokenFilterNamed(ar.NormalizeName)
+	if err != nil {
+		return nil, err
+	}
+	persianNormalizer, err := cache.TokenFilterNamed(NormalizeName)
+	if err != nil {
+		return nil, err
+	}
+	lowerCase, err := cache.TokenFilterNamed(lower_case_filter.Name)
+	if err != nil {
+		return nil, err
+	}
+	stopWords, err := cache.TokenFilterNamed(StopName)
+	if err != nil {
+		return nil, err
+	}
+	analyzer := &analysis.Analyzer{
+		CharFilters: []analysis.CharFilter{zwnjCharFilter},
+		Tokenizer:   tokenizer,
+		TokenFilters: []analysis.TokenFilter{
+			lowerCase,
+			arabicNormalizer,
+			persianNormalizer,
+			stopWords,
+		},
+	}
+	return analyzer, nil
+}
+
+// init registers AnalyzerConstructor with the registry under AnalyzerName
+// during package initialization.
+func init() {
+	registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
+}

+ 12 - 1
analysis/token_filters/persian_normalize/persian_normalize.go

@@ -6,14 +6,17 @@
 //  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 //  either express or implied. See the License for the specific language governing permissions
 //  and limitations under the License.
-package persian_normalize
+package fa
 
 import (
 	"bytes"
 
 	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/registry"
 )
 
+// NormalizeName is the name under which this package registers its Persian
+// normalization token filter with the registry.
+const NormalizeName = "normalize_fa"
+
 const (
 	YEH         = '\u064A'
 	FARSI_YEH   = '\u06CC'
@@ -62,3 +65,11 @@ func normalize(input []byte) []byte {
 	}
 	return analysis.BuildTermFromRunes(runes)
 }
+
+// NormalizerFilterConstructor returns a new Persian normalize filter. Neither
+// config nor cache is consulted, and the returned error is always nil.
+func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	filter := NewPersianNormalizeFilter()
+	return filter, nil
+}
+
+// init registers NormalizerFilterConstructor with the registry under
+// NormalizeName during package initialization.
+func init() {
+	registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
+}

+ 1 - 1
analysis/token_filters/persian_normalize/persian_normalize_test.go

@@ -6,7 +6,7 @@
 //  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 //  either express or implied. See the License for the specific language governing permissions
 //  and limitations under the License.
-package persian_normalize
+package fa
 
 import (
 	"reflect"

+ 27 - 0
analysis/language/fa/stop_filter_fa.go

@@ -0,0 +1,27 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+package fa
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// StopTokenFilterConstructor builds a stop tokens filter backed by the token
+// map that cache resolves for StopName. The config argument is not consulted.
+// If the token map cannot be resolved, the lookup error is returned unchanged
+// and no filter is built.
+func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	stopWords, lookupErr := cache.TokenMapNamed(StopName)
+	if lookupErr != nil {
+		return nil, lookupErr
+	}
+	filter := stop_tokens_filter.NewStopTokensFilter(stopWords)
+	return filter, nil
+}
+
+// init registers StopTokenFilterConstructor with the registry under StopName
+// during package initialization.
+func init() {
+	registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
+}

+ 18 - 1
analysis/token_filters/stop_words_filter/stop_words_fa.go

@@ -1,4 +1,11 @@
-package stop_words_filter
+package fa
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// StopName is the name under which this package registers its Persian stop
+// word token map with the registry.
+const StopName = "stop_fa"
 
 // this content was obtained from:
 // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
@@ -318,3 +325,13 @@ var PersianStopWords = []byte(`# This file was created by Jacques Savoy and is d
 عنوان
 بود
 `)
+
+// TokenMapConstructor creates a fresh token map and loads PersianStopWords
+// into it. Neither config nor cache is consulted. If loading fails, the map
+// as loaded so far is still returned together with the error.
+func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
+	stopWords := analysis.NewTokenMap()
+	if err := stopWords.LoadBytes(PersianStopWords); err != nil {
+		return stopWords, err
+	}
+	return stopWords, nil
+}
+
+// init registers TokenMapConstructor with the registry under StopName during
+// package initialization.
+func init() {
+	registry.RegisterTokenMap(StopName, TokenMapConstructor)
+}

+ 51 - 0
analysis/language/fi/analyzer_fi.go

@@ -0,0 +1,51 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+package fi
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/registry"
+
+	"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
+	"github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary"
+)
+
+// AnalyzerName is the name under which this package registers its analyzer
+// with the registry.
+const AnalyzerName = "fi"
+
+// AnalyzerConstructor assembles the analyzer registered under AnalyzerName
+// from components resolved by name through cache: the unicode word boundary
+// tokenizer, followed by the lower case, StopName and StemmerName token
+// filters applied in that order. The config argument is not consulted. The
+// first lookup that fails aborts construction and its error is returned
+// unchanged.
+func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
+	tokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name)
+	if err != nil {
+		return nil, err
+	}
+	// filters are resolved in the same order in which they run in the chain
+	filterNames := []string{lower_case_filter.Name, StopName, StemmerName}
+	filters := make([]analysis.TokenFilter, 0, len(filterNames))
+	for _, filterName := range filterNames {
+		filter, lookupErr := cache.TokenFilterNamed(filterName)
+		if lookupErr != nil {
+			return nil, lookupErr
+		}
+		filters = append(filters, filter)
+	}
+	return &analysis.Analyzer{
+		Tokenizer:    tokenizer,
+		TokenFilters: filters,
+	}, nil
+}
+
+// init registers AnalyzerConstructor with the registry under AnalyzerName
+// during package initialization.
+func init() {
+	registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
+}

+ 25 - 0
analysis/language/fi/stemmer_fi.go

@@ -0,0 +1,25 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+package fi
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// StemmerName is the name under which this package registers its stemmer
+// token filter with the registry.
+const StemmerName = "stemmer_fi"
+
+func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	return stemmer_filter.NewStemmerFilter("fi")
+}
+
+// init registers StemmerFilterConstructor with the registry under
+// StemmerName during package initialization.
+func init() {
+	registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
+}

+ 27 - 0
analysis/language/fi/stop_filter_fi.go

@@ -0,0 +1,27 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+package fi
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// StopTokenFilterConstructor builds a stop tokens filter backed by the token
+// map that cache resolves for StopName. The config argument is not consulted.
+// If the token map cannot be resolved, the lookup error is returned unchanged
+// and no filter is built.
+func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	stopWords, lookupErr := cache.TokenMapNamed(StopName)
+	if lookupErr != nil {
+		return nil, lookupErr
+	}
+	filter := stop_tokens_filter.NewStopTokensFilter(stopWords)
+	return filter, nil
+}
+
+// init registers StopTokenFilterConstructor with the registry under StopName
+// during package initialization.
+func init() {
+	registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
+}

+ 18 - 1
analysis/token_filters/stop_words_filter/stop_words_fi.go

@@ -1,4 +1,11 @@
-package stop_words_filter
+package fi
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// StopName is the name under which this package registers its Finnish stop
+// word token map with the registry.
+const StopName = "stop_fi"
 
 // this content was obtained from:
 // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
@@ -102,3 +109,13 @@ nyt    | now
 itse   | self
 
 `)
+
+// TokenMapConstructor creates a fresh token map and loads FinnishStopWords
+// into it. Neither config nor cache is consulted. If loading fails, the map
+// as loaded so far is still returned together with the error.
+func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
+	stopWords := analysis.NewTokenMap()
+	if err := stopWords.LoadBytes(FinnishStopWords); err != nil {
+		return stopWords, err
+	}
+	return stopWords, nil
+}
+
+// init registers TokenMapConstructor with the registry under StopName during
+// package initialization.
+func init() {
+	registry.RegisterTokenMap(StopName, TokenMapConstructor)
+}

+ 56 - 0
analysis/language/fr/analyzer_fr.go

@@ -0,0 +1,56 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+package fr
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/registry"
+
+	"github.com/couchbaselabs/bleve/analysis/token_filters/lower_case_filter"
+	"github.com/couchbaselabs/bleve/analysis/tokenizers/unicode_word_boundary"
+)
+
+// AnalyzerName is the name under which this package registers its analyzer
+// with the registry.
+const AnalyzerName = "fr"
+
+// AnalyzerConstructor assembles the analyzer registered under AnalyzerName
+// from components resolved by name through cache: the unicode word boundary
+// tokenizer, followed by the ElisionName, lower case, StopName and
+// StemmerName token filters applied in that order. The config argument is
+// not consulted. The first lookup that fails aborts construction and its
+// error is returned unchanged.
+func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
+	tokenizer, err := cache.TokenizerNamed(unicode_word_boundary.Name)
+	if err != nil {
+		return nil, err
+	}
+	// filters are resolved in the same order in which they run in the chain
+	filterNames := []string{ElisionName, lower_case_filter.Name, StopName, StemmerName}
+	filters := make([]analysis.TokenFilter, 0, len(filterNames))
+	for _, filterName := range filterNames {
+		filter, lookupErr := cache.TokenFilterNamed(filterName)
+		if lookupErr != nil {
+			return nil, lookupErr
+		}
+		filters = append(filters, filter)
+	}
+	return &analysis.Analyzer{
+		Tokenizer:    tokenizer,
+		TokenFilters: filters,
+	}, nil
+}
+
+// init registers AnalyzerConstructor with the registry under AnalyzerName
+// during package initialization.
+func init() {
+	registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
+}

+ 37 - 0
analysis/language/fr/articles_fr.go

@@ -0,0 +1,37 @@
+package fr
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// ArticlesName is the name under which this package registers its French
+// articles token map with the registry.
+const ArticlesName = "articles_fr"
+
+// this content was obtained from:
+// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis
+
+// FrenchArticles is the newline-separated term list that
+// ArticlesTokenMapConstructor loads into the token map registered under
+// ArticlesName.
+var FrenchArticles = []byte(`
+l
+m
+t
+qu
+n
+s
+j
+d
+c
+jusqu
+quoiqu
+lorsqu
+puisqu
+`)
+
+// ArticlesTokenMapConstructor creates a fresh token map and loads
+// FrenchArticles into it. Neither config nor cache is consulted. If loading
+// fails, the map as loaded so far is still returned together with the error.
+func ArticlesTokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
+	articles := analysis.NewTokenMap()
+	if err := articles.LoadBytes(FrenchArticles); err != nil {
+		return articles, err
+	}
+	return articles, nil
+}
+
+// init registers ArticlesTokenMapConstructor with the registry under
+// ArticlesName during package initialization.
+func init() {
+	registry.RegisterTokenMap(ArticlesName, ArticlesTokenMapConstructor)
+}

+ 31 - 0
analysis/language/fr/elision_fr.go

@@ -0,0 +1,31 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+package fr
+
+import (
+	"fmt"
+
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/analysis/token_filters/elision_filter"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// ElisionName is the name under which this package registers its elision
+// token filter with the registry.
+const ElisionName = "elision_fr"
+
+// ElisionFilterConstructor builds an elision filter backed by the token map
+// that cache resolves for ArticlesName. The config argument is not consulted.
+// If the token map cannot be resolved, a new error describing the failed
+// elision filter build (with the lookup error's text embedded) is returned.
+func ElisionFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	articles, lookupErr := cache.TokenMapNamed(ArticlesName)
+	if lookupErr != nil {
+		return nil, fmt.Errorf("error building elision filter: %v", lookupErr)
+	}
+	filter := elision_filter.NewElisionFilter(articles)
+	return filter, nil
+}
+
+// init registers ElisionFilterConstructor with the registry under
+// ElisionName during package initialization.
+func init() {
+	registry.RegisterTokenFilter(ElisionName, ElisionFilterConstructor)
+}

+ 18 - 24
analysis/tokenizers/rune_tokenizer/rune_tokenizer_test.go

@@ -6,50 +6,44 @@
 //  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 //  either express or implied. See the License for the specific language governing permissions
 //  and limitations under the License.
-package rune_tokenizer
+package fr
 
 import (
 	"reflect"
 	"testing"
 
 	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/registry"
 )
 
-func TestWhitespaceTokenizer(t *testing.T) {
-
-	classifier := NewWhitespaceClassifier()
-
+// TestFrenchElision runs each input token stream through the token filter
+// registered under ElisionName and compares the result with the expected
+// stream (e.g. "l'avion" is expected to become "avion").
+func TestFrenchElision(t *testing.T) {
 	tests := []struct {
-		input  []byte
+		input  analysis.TokenStream
 		output analysis.TokenStream
 	}{
 		{
-			[]byte("Hello World"),
-			analysis.TokenStream{
-				{
-					Start:    0,
-					End:      5,
-					Term:     []byte("Hello"),
-					Position: 1,
-					Type:     analysis.AlphaNumeric,
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("l'avion"),
 				},
-				{
-					Start:    6,
-					End:      11,
-					Term:     []byte("World"),
-					Position: 2,
-					Type:     analysis.AlphaNumeric,
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("avion"),
 				},
 			},
 		},
 	}
 
+	// the filter is obtained by name from a fresh cache rather than being
+	// constructed directly
+	cache := registry.NewCache()
+	elisionFilter, err := cache.TokenFilterNamed(ElisionName)
+	if err != nil {
+		t.Fatal(err)
+	}
 	for _, test := range tests {
-		tokenizer := NewRuneTokenizer(classifier)
-		actual := tokenizer.Tokenize(test.input)
-
+		actual := elisionFilter.Filter(test.input)
 		if !reflect.DeepEqual(actual, test.output) {
-			t.Errorf("Expected %v, got %v for %s", test.output, actual, string(test.input))
+			t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
 		}
 	}
 }

+ 25 - 0
analysis/language/fr/stemmer_fr.go

@@ -0,0 +1,25 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+package fr
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/analysis/token_filters/stemmer_filter"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// StemmerName is the name under which this package registers its stemmer
+// token filter with the registry.
+const StemmerName = "stemmer_fr"
+
+func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	return stemmer_filter.NewStemmerFilter("fr")
+}
+
+// init registers StemmerFilterConstructor with the registry under
+// StemmerName during package initialization.
+func init() {
+	registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
+}

+ 27 - 0
analysis/language/fr/stop_filter_fr.go

@@ -0,0 +1,27 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+package fr
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// StopTokenFilterConstructor builds a stop tokens filter backed by the token
+// map that cache resolves for StopName. The config argument is not consulted.
+// If the token map cannot be resolved, the lookup error is returned unchanged
+// and no filter is built.
+func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	stopWords, lookupErr := cache.TokenMapNamed(StopName)
+	if lookupErr != nil {
+		return nil, lookupErr
+	}
+	filter := stop_tokens_filter.NewStopTokensFilter(stopWords)
+	return filter, nil
+}
+
+// init registers StopTokenFilterConstructor with the registry under StopName
+// during package initialization.
+func init() {
+	registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
+}

+ 18 - 1
analysis/token_filters/stop_words_filter/stop_words_fr.go

@@ -1,4 +1,11 @@
-package stop_words_filter
+package fr
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// StopName is the name under which this package registers its French stop
+// word token map with the registry.
+const StopName = "stop_fr"
 
 // this content was obtained from:
 // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
@@ -191,3 +198,13 @@ sans           |  without
 soi            |  oneself
 
 `)
+
+// TokenMapConstructor creates a fresh token map and loads FrenchStopWords
+// into it. Neither config nor cache is consulted. If loading fails, the map
+// as loaded so far is still returned together with the error.
+func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
+	stopWords := analysis.NewTokenMap()
+	if err := stopWords.LoadBytes(FrenchStopWords); err != nil {
+		return stopWords, err
+	}
+	return stopWords, nil
+}
+
+// init registers TokenMapConstructor with the registry under StopName during
+// package initialization.
+func init() {
+	registry.RegisterTokenMap(StopName, TokenMapConstructor)
+}

+ 27 - 0
analysis/language/ga/articles_ga.go

@@ -0,0 +1,27 @@
+package ga
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// ArticlesName is the name under which this package registers its Irish
+// articles token map with the registry.
+const ArticlesName = "articles_ga"
+
+// this content was obtained from:
+// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis
+
+// IrishArticles is the newline-separated term list that
+// ArticlesTokenMapConstructor loads into the token map registered under
+// ArticlesName.
+var IrishArticles = []byte(`
+d
+m
+b
+`)
+
+// ArticlesTokenMapConstructor creates a fresh token map and loads
+// IrishArticles into it. Neither config nor cache is consulted. If loading
+// fails, the map as loaded so far is still returned together with the error.
+func ArticlesTokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
+	articles := analysis.NewTokenMap()
+	if err := articles.LoadBytes(IrishArticles); err != nil {
+		return articles, err
+	}
+	return articles, nil
+}
+
+// init registers ArticlesTokenMapConstructor with the registry under
+// ArticlesName during package initialization.
+func init() {
+	registry.RegisterTokenMap(ArticlesName, ArticlesTokenMapConstructor)
+}

+ 31 - 0
analysis/language/ga/elision_ga.go

@@ -0,0 +1,31 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+package ga
+
+import (
+	"fmt"
+
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/analysis/token_filters/elision_filter"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// ElisionName is the name under which this package registers its elision
+// token filter with the registry.
+const ElisionName = "elision_ga"
+
+// ElisionFilterConstructor builds an elision filter backed by the token map
+// that cache resolves for ArticlesName. The config argument is not consulted.
+// If the token map cannot be resolved, a new error describing the failed
+// elision filter build (with the lookup error's text embedded) is returned.
+func ElisionFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	articles, lookupErr := cache.TokenMapNamed(ArticlesName)
+	if lookupErr != nil {
+		return nil, fmt.Errorf("error building elision filter: %v", lookupErr)
+	}
+	filter := elision_filter.NewElisionFilter(articles)
+	return filter, nil
+}
+
+// init registers ElisionFilterConstructor with the registry under
+// ElisionName during package initialization.
+func init() {
+	registry.RegisterTokenFilter(ElisionName, ElisionFilterConstructor)
+}

+ 49 - 0
analysis/language/ga/elision_ga_test.go

@@ -0,0 +1,49 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+package ga
+
+import (
+	"reflect"
+	"testing"
+
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// TestIrishElision runs each input token stream through the token filter
+// registered under ElisionName and compares the result with the expected
+// stream (e.g. "b'fhearr" is expected to become "fhearr").
+func TestIrishElision(t *testing.T) {
+	tests := []struct {
+		input  analysis.TokenStream
+		output analysis.TokenStream
+	}{
+		{
+			input: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("b'fhearr"),
+				},
+			},
+			output: analysis.TokenStream{
+				&analysis.Token{
+					Term: []byte("fhearr"),
+				},
+			},
+		},
+	}
+
+	// the filter is obtained by name from a fresh cache rather than being
+	// constructed directly
+	cache := registry.NewCache()
+	elisionFilter, err := cache.TokenFilterNamed(ElisionName)
+	if err != nil {
+		t.Fatal(err)
+	}
+	for _, test := range tests {
+		actual := elisionFilter.Filter(test.input)
+		if reflect.DeepEqual(actual, test.output) {
+			continue
+		}
+		// guard the indexing below: a filter that returns an empty stream
+		// must produce a test failure, not an index-out-of-range panic
+		if len(actual) == 0 || len(test.output) == 0 {
+			t.Errorf("expected %d tokens, got %d", len(test.output), len(actual))
+			continue
+		}
+		t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term)
+	}
+}

+ 27 - 0
analysis/language/ga/stop_filter_ga.go

@@ -0,0 +1,27 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+package ga
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// StopTokenFilterConstructor builds a stop tokens filter backed by the token
+// map that cache resolves for StopName. The config argument is not consulted.
+// If the token map cannot be resolved, the lookup error is returned unchanged
+// and no filter is built.
+func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	stopWords, lookupErr := cache.TokenMapNamed(StopName)
+	if lookupErr != nil {
+		return nil, lookupErr
+	}
+	filter := stop_tokens_filter.NewStopTokensFilter(stopWords)
+	return filter, nil
+}
+
+// init registers StopTokenFilterConstructor with the registry under StopName
+// during package initialization.
+func init() {
+	registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
+}

+ 18 - 1
analysis/token_filters/stop_words_filter/stop_words_ga.go

@@ -1,4 +1,11 @@
-package stop_words_filter
+package ga
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// StopName is the name under which this package registers its Irish stop
+// word token map with the registry.
+const StopName = "stop_ga"
 
 // this content was obtained from:
 // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
@@ -115,3 +122,13 @@ um
 óna
 ónár
 `)
+
+// TokenMapConstructor creates a fresh token map and loads IrishStopWords
+// into it. Neither config nor cache is consulted. If loading fails, the map
+// as loaded so far is still returned together with the error.
+func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
+	stopWords := analysis.NewTokenMap()
+	if err := stopWords.LoadBytes(IrishStopWords); err != nil {
+		return stopWords, err
+	}
+	return stopWords, nil
+}
+
+// init registers TokenMapConstructor with the registry under StopName during
+// package initialization.
+func init() {
+	registry.RegisterTokenMap(StopName, TokenMapConstructor)
+}

+ 27 - 0
analysis/language/gl/stop_filter_gl.go

@@ -0,0 +1,27 @@
+//  Copyright (c) 2014 Couchbase, Inc.
+//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
+//  except in compliance with the License. You may obtain a copy of the License at
+//    http://www.apache.org/licenses/LICENSE-2.0
+//  Unless required by applicable law or agreed to in writing, software distributed under the
+//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
+//  either express or implied. See the License for the specific language governing permissions
+//  and limitations under the License.
+package gl
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/analysis/token_filters/stop_tokens_filter"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// StopTokenFilterConstructor builds a stop tokens filter backed by the token
+// map that cache resolves for StopName. The config argument is not consulted.
+// If the token map cannot be resolved, the lookup error is returned unchanged
+// and no filter is built.
+func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	stopWords, lookupErr := cache.TokenMapNamed(StopName)
+	if lookupErr != nil {
+		return nil, lookupErr
+	}
+	filter := stop_tokens_filter.NewStopTokensFilter(stopWords)
+	return filter, nil
+}
+
+// init registers StopTokenFilterConstructor with the registry under StopName
+// during package initialization.
+func init() {
+	registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
+}

+ 18 - 1
analysis/token_filters/stop_words_filter/stop_words_gl.go

@@ -1,4 +1,11 @@
-package stop_words_filter
+package gl
+
+import (
+	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/registry"
+)
+
+// StopName is the name under which this package registers its Galician stop
+// word token map with the registry.
+const StopName = "stop_gl"
 
 // this content was obtained from:
 // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/
@@ -166,3 +173,13 @@ voso
 vosos
 vós
 `)
+
+// TokenMapConstructor creates a fresh token map and loads GalicianStopWords
+// into it. Neither config nor cache is consulted. If loading fails, the map
+// as loaded so far is still returned together with the error.
+func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
+	stopWords := analysis.NewTokenMap()
+	if err := stopWords.LoadBytes(GalicianStopWords); err != nil {
+		return stopWords, err
+	}
+	return stopWords, nil
+}
+
+// init registers TokenMapConstructor with the registry under StopName during
+// package initialization.
+func init() {
+	registry.RegisterTokenMap(StopName, TokenMapConstructor)
+}

+ 12 - 1
analysis/token_filters/hindi_normalize/hindi_normalize.go

@@ -6,14 +6,17 @@
 //  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 //  either express or implied. See the License for the specific language governing permissions
 //  and limitations under the License.
-package hindi_normalize
+package hi
 
 import (
 	"bytes"
 
 	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/registry"
 )
 
+// NormalizeName is the name under which this package registers its Hindi
+// normalization token filter with the registry.
+const NormalizeName = "normalize_hi"
+
+// HindiNormalizeFilter has no fields.
+// NOTE(review): presumably the token filter type produced by
+// NewHindiNormalizeFilter — confirm against the rest of the file.
 type HindiNormalizeFilter struct {
 }
 
@@ -123,3 +126,11 @@ func normalize(input []byte) []byte {
 	}
 	return analysis.BuildTermFromRunes(runes)
 }
+
+// NormalizerFilterConstructor returns a new Hindi normalize filter. Neither
+// config nor cache is consulted, and the returned error is always nil.
+func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	filter := NewHindiNormalizeFilter()
+	return filter, nil
+}
+
+// init registers NormalizerFilterConstructor with the registry under
+// NormalizeName during package initialization.
+func init() {
+	registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
+}

+ 1 - 1
analysis/token_filters/hindi_normalize/hindi_normalize_test.go

@@ -6,7 +6,7 @@
 //  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 //  either express or implied. See the License for the specific language governing permissions
 //  and limitations under the License.
-package hindi_normalize
+package hi
 
 import (
 	"reflect"

+ 12 - 1
analysis/token_filters/hindi_stemmer_filter/hindi_stemmer_filter.go

@@ -6,15 +6,18 @@
 //  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 //  either express or implied. See the License for the specific language governing permissions
 //  and limitations under the License.
-package hindi_stemmer_filter
+package hi
 
 import (
 	"bytes"
 	"unicode/utf8"
 
 	"github.com/couchbaselabs/bleve/analysis"
+	"github.com/couchbaselabs/bleve/registry"
 )
 
+// StemmerName is the name under which this package registers its Hindi
+// stemmer token filter with the registry.
+const StemmerName = "stemmer_hi"
+
+// HindiStemmerFilter has no fields.
+// NOTE(review): presumably the token filter type produced by
+// NewHindiStemmerFilter — confirm against the rest of the file.
 type HindiStemmerFilter struct {
 }
 
@@ -134,3 +137,11 @@ func stem(input []byte) []byte {
 
 	return input
 }
+
+// StemmerFilterConstructor returns a new Hindi stemmer filter. Neither config
+// nor cache is consulted, and the returned error is always nil.
+func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
+	filter := NewHindiStemmerFilter()
+	return filter, nil
+}
+
+// init registers StemmerFilterConstructor with the registry under
+// StemmerName during package initialization.
+func init() {
+	registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
+}

+ 1 - 1
analysis/token_filters/hindi_stemmer_filter/hindi_stemmer_filter_test.go

@@ -6,7 +6,7 @@
 //  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
 //  either express or implied. See the License for the specific language governing permissions