Browse Source

major refactor of bleve configuration

see #221 for full details
Marty Schoch 4 years ago
parent
commit
f81b2be334
100 changed files with 158 additions and 6290 deletions
  1. 0 49
      analysis/analyzers/detect_lang_analyzer/detect_lang_analyzer.go
  2. 3 1
      analysis/byte_array_converters/ignore/ignore_byte_array_converter.go
  3. 3 1
      analysis/byte_array_converters/json/json_byte_array_converter.go
  4. 3 1
      analysis/byte_array_converters/string/string_byte_array_conveter.go
  5. 1 1
      analysis/datetime_parsers/datetime_optional/datetime_optional.go
  6. 3 5
      analysis/language/ckb/analyzer_ckb.go
  7. 0 2
      analysis/language/ckb/analyzer_ckb_test.go
  8. 0 54
      analysis/language/da/analyzer_da.go
  9. 0 69
      analysis/language/da/analyzer_da_test.go
  10. 0 28
      analysis/language/da/stemmer_da.go
  11. 0 28
      analysis/language/da/stop_filter_da.go
  12. 0 134
      analysis/language/da/stop_words_da.go
  13. 0 59
      analysis/language/de/analyzer_de.go
  14. 0 97
      analysis/language/de/analyzer_de_test.go
  15. 0 94
      analysis/language/de/german_normalize.go
  16. 0 98
      analysis/language/de/german_normalize_test.go
  17. 0 28
      analysis/language/de/stemmer_de.go
  18. 0 28
      analysis/language/de/stop_filter_de.go
  19. 0 318
      analysis/language/de/stop_words_de.go
  20. 0 28
      analysis/language/en/stemmer_en.go
  21. 0 72
      analysis/language/en/stemmer_en_test.go
  22. 0 55
      analysis/language/es/analyzer_es.go
  23. 0 64
      analysis/language/es/analyzer_es_test.go
  24. 0 28
      analysis/language/es/stemmer_es.go
  25. 0 28
      analysis/language/es/stop_filter_es.go
  26. 0 380
      analysis/language/es/stop_words_es.go
  27. 3 5
      analysis/language/fa/analyzer_fa.go
  28. 0 2
      analysis/language/fa/analyzer_fa_test.go
  29. 0 55
      analysis/language/fi/analyzer_fi.go
  30. 0 68
      analysis/language/fi/analyzer_fi_test.go
  31. 0 28
      analysis/language/fi/stemmer_fi.go
  32. 0 28
      analysis/language/fi/stop_filter_fi.go
  33. 0 121
      analysis/language/fi/stop_words_fi.go
  34. 0 28
      analysis/language/fr/stemmer_fr.go
  35. 0 55
      analysis/language/hu/analyzer_hu.go
  36. 0 68
      analysis/language/hu/analyzer_hu_test.go
  37. 0 28
      analysis/language/hu/stemmer_hu.go
  38. 0 28
      analysis/language/hu/stop_filter_hu.go
  39. 0 235
      analysis/language/hu/stop_words_hu.go
  40. 0 28
      analysis/language/it/stemmer_it.go
  41. 0 2
      analysis/language/ja/analyzer_ja.go
  42. 0 2
      analysis/language/ja/analyzer_ja_test.go
  43. 0 2
      analysis/language/ja/ja_morph_kagome.go
  44. 0 2
      analysis/language/ja/ja_morph_kagome_test.go
  45. 0 55
      analysis/language/nl/analyzer_nl.go
  46. 0 68
      analysis/language/nl/analyzer_nl_test.go
  47. 0 28
      analysis/language/nl/stemmer_nl.go
  48. 0 28
      analysis/language/nl/stop_filter_nl.go
  49. 0 143
      analysis/language/nl/stop_words_nl.go
  50. 0 55
      analysis/language/no/analyzer_no.go
  51. 0 68
      analysis/language/no/analyzer_no_test.go
  52. 0 28
      analysis/language/no/stemmer_no.go
  53. 0 28
      analysis/language/no/stop_filter_no.go
  54. 0 218
      analysis/language/no/stop_words_no.go
  55. 0 28
      analysis/language/porter/stemmer_porter.go
  56. 0 28
      analysis/language/pt/stemmer_pt.go
  57. 0 55
      analysis/language/ro/analyzer_ro.go
  58. 0 68
      analysis/language/ro/analyzer_ro_test.go
  59. 0 28
      analysis/language/ro/stemmer_ro.go
  60. 0 28
      analysis/language/ro/stop_filter_ro.go
  61. 0 257
      analysis/language/ro/stop_words_ro.go
  62. 0 55
      analysis/language/ru/analyzer_ru.go
  63. 0 98
      analysis/language/ru/analyzer_ru_test.go
  64. 0 28
      analysis/language/ru/stemmer_ru.go
  65. 0 28
      analysis/language/ru/stop_filter_ru.go
  66. 0 267
      analysis/language/ru/stop_words_ru.go
  67. 0 55
      analysis/language/sv/analyzer_sv.go
  68. 0 68
      analysis/language/sv/analyzer_sv_test.go
  69. 0 28
      analysis/language/sv/stemmer_sv.go
  70. 0 28
      analysis/language/sv/stop_filter_sv.go
  71. 0 157
      analysis/language/sv/stop_words_sv.go
  72. 0 48
      analysis/language/th/analyzer_th.go
  73. 0 119
      analysis/language/th/analyzer_th_test.go
  74. 0 28
      analysis/language/th/stop_filter_th.go
  75. 0 143
      analysis/language/th/stop_words_th.go
  76. 0 28
      analysis/language/th/unicode_tokenizer_th.go
  77. 0 61
      analysis/language/tr/analyzer_tr.go
  78. 0 88
      analysis/language/tr/analyzer_tr_test.go
  79. 0 28
      analysis/language/tr/stemmer_tr.go
  80. 0 28
      analysis/language/tr/stop_filter_tr.go
  81. 0 236
      analysis/language/tr/stop_words_tr.go
  82. 0 33
      analysis/token_filters/cld2/README.md
  83. 0 44
      analysis/token_filters/cld2/cld2_filter.cc
  84. 0 67
      analysis/token_filters/cld2/cld2_filter.go
  85. 0 18
      analysis/token_filters/cld2/cld2_filter.h
  86. 0 123
      analysis/token_filters/cld2/cld2_filter_test.go
  87. 0 10
      analysis/token_filters/cld2/compile_cld2.sh
  88. 0 18
      analysis/token_filters/stemmer_filter/README.md
  89. 0 80
      analysis/token_filters/stemmer_filter/stemmer_filter.go
  90. 0 63
      analysis/token_filters/stemmer_filter/stemmer_filter_test.go
  91. 0 138
      analysis/tokenizers/icu/boundary.go
  92. 0 191
      analysis/tokenizers/icu/boundary_test.go
  93. 6 129
      config.go
  94. 11 0
      config/README.md
  95. 98 0
      config/config.go
  96. 16 0
      config/config_cld2.go
  97. 4 4
      config_cznicb.go
  98. 4 4
      config_forestdb.go
  99. 3 3
      config_icu.go
  100. 0 0
      config_kagome.go

+ 0 - 49
analysis/analyzers/detect_lang_analyzer/detect_lang_analyzer.go

@@ -1,49 +0,0 @@
-//  Copyright (c) 2014 Couchbase, Inc.
-//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
-//  except in compliance with the License. You may obtain a copy of the License at
-//    http://www.apache.org/licenses/LICENSE-2.0
-//  Unless required by applicable law or agreed to in writing, software distributed under the
-//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
-//  either express or implied. See the License for the specific language governing permissions
-//  and limitations under the License.
-
-// +build cld2 full
-
-package detect_lang_analyzer
-
-import (
-	"github.com/blevesearch/bleve/analysis"
-	"github.com/blevesearch/bleve/analysis/token_filters/cld2"
-	"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
-	"github.com/blevesearch/bleve/analysis/tokenizers/single_token"
-	"github.com/blevesearch/bleve/registry"
-)
-
-const Name = "detect_lang"
-
-func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
-	keywordTokenizer, err := cache.TokenizerNamed(single_token.Name)
-	if err != nil {
-		return nil, err
-	}
-	toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
-	if err != nil {
-		return nil, err
-	}
-	detectLangFilter, err := cache.TokenFilterNamed(cld2.Name)
-	if err != nil {
-		return nil, err
-	}
-	rv := analysis.Analyzer{
-		Tokenizer: keywordTokenizer,
-		TokenFilters: []analysis.TokenFilter{
-			toLowerFilter,
-			detectLangFilter,
-		},
-	}
-	return &rv, nil
-}
-
-func init() {
-	registry.RegisterAnalyzer(Name, AnalyzerConstructor)
-}

+ 3 - 1
analysis/byte_array_converters/ignore/ignore_byte_array_converter.go

@@ -14,6 +14,8 @@ import (
 	"github.com/blevesearch/bleve/registry"
 )
 
+const Name = "ignore"
+
 type IgnoreByteArrayConverter struct{}
 
 func NewIgnoreByteArrayConverter() *IgnoreByteArrayConverter {
@@ -29,5 +31,5 @@ func Constructor(config map[string]interface{}, cache *registry.Cache) (analysis
 }
 
 func init() {
-	registry.RegisterByteArrayConverter("ignore", Constructor)
+	registry.RegisterByteArrayConverter(Name, Constructor)
 }

+ 3 - 1
analysis/byte_array_converters/json/json_byte_array_converter.go

@@ -16,6 +16,8 @@ import (
 	"github.com/blevesearch/bleve/registry"
 )
 
+const Name = "json"
+
 type JSONByteArrayConverter struct{}
 
 func NewJSONByteArrayConverter() *JSONByteArrayConverter {
@@ -36,5 +38,5 @@ func Constructor(config map[string]interface{}, cache *registry.Cache) (analysis
 }
 
 func init() {
-	registry.RegisterByteArrayConverter("json", Constructor)
+	registry.RegisterByteArrayConverter(Name, Constructor)
 }

+ 3 - 1
analysis/byte_array_converters/string/string_byte_array_conveter.go

@@ -14,6 +14,8 @@ import (
 	"github.com/blevesearch/bleve/registry"
 )
 
+const Name = "string"
+
 type StringByteArrayConverter struct{}
 
 func NewStringByteArrayConverter() *StringByteArrayConverter {
@@ -29,5 +31,5 @@ func Constructor(config map[string]interface{}, cache *registry.Cache) (analysis
 }
 
 func init() {
-	registry.RegisterByteArrayConverter("string", Constructor)
+	registry.RegisterByteArrayConverter(Name, Constructor)
 }

+ 1 - 1
analysis/datetime_parsers/datetime_optional/datetime_optional.go

@@ -7,7 +7,7 @@
 //  either express or implied. See the License for the specific language governing permissions
 //  and limitations under the License.
 
-package html_char_filter
+package datetime_optional
 
 import (
 	"time"

+ 3 - 5
analysis/language/ckb/analyzer_ckb.go

@@ -7,21 +7,19 @@
 //  either express or implied. See the License for the specific language governing permissions
 //  and limitations under the License.
 
-// +build icu full
-
 package ckb
 
 import (
 	"github.com/blevesearch/bleve/analysis"
 	"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
-	"github.com/blevesearch/bleve/analysis/tokenizers/icu"
+	"github.com/blevesearch/bleve/analysis/tokenizers/unicode"
 	"github.com/blevesearch/bleve/registry"
 )
 
 const AnalyzerName = "ckb"
 
 func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
-	icuTokenizer, err := cache.TokenizerNamed(icu.Name)
+	unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name)
 	if err != nil {
 		return nil, err
 	}
@@ -42,7 +40,7 @@ func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (
 		return nil, err
 	}
 	rv := analysis.Analyzer{
-		Tokenizer: icuTokenizer,
+		Tokenizer: unicodeTokenizer,
 		TokenFilters: []analysis.TokenFilter{
 			normCkbFilter,
 			toLowerFilter,

+ 0 - 2
analysis/language/ckb/analyzer_ckb_test.go

@@ -7,8 +7,6 @@
 //  either express or implied. See the License for the specific language governing permissions
 //  and limitations under the License.
 
-// +build icu full
-
 package ckb
 
 import (

+ 0 - 54
analysis/language/da/analyzer_da.go

@@ -1,54 +0,0 @@
-//  Copyright (c) 2014 Couchbase, Inc.
-//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
-//  except in compliance with the License. You may obtain a copy of the License at
-//    http://www.apache.org/licenses/LICENSE-2.0
-//  Unless required by applicable law or agreed to in writing, software distributed under the
-//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
-//  either express or implied. See the License for the specific language governing permissions
-//  and limitations under the License.
-
-// +build libstemmer full
-// +build icu full
-
-package da
-
-import (
-	"github.com/blevesearch/bleve/analysis"
-	"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
-	"github.com/blevesearch/bleve/analysis/tokenizers/icu"
-	"github.com/blevesearch/bleve/registry"
-)
-
-const AnalyzerName = "da"
-
-func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
-	icuTokenizer, err := cache.TokenizerNamed(icu.Name)
-	if err != nil {
-		return nil, err
-	}
-	toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
-	if err != nil {
-		return nil, err
-	}
-	stopDaFilter, err := cache.TokenFilterNamed(StopName)
-	if err != nil {
-		return nil, err
-	}
-	stemmerDaFilter, err := cache.TokenFilterNamed(StemmerName)
-	if err != nil {
-		return nil, err
-	}
-	rv := analysis.Analyzer{
-		Tokenizer: icuTokenizer,
-		TokenFilters: []analysis.TokenFilter{
-			toLowerFilter,
-			stopDaFilter,
-			stemmerDaFilter,
-		},
-	}
-	return &rv, nil
-}
-
-func init() {
-	registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
-}

+ 0 - 69
analysis/language/da/analyzer_da_test.go

@@ -1,69 +0,0 @@
-//  Copyright (c) 2014 Couchbase, Inc.
-//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
-//  except in compliance with the License. You may obtain a copy of the License at
-//    http://www.apache.org/licenses/LICENSE-2.0
-//  Unless required by applicable law or agreed to in writing, software distributed under the
-//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
-//  either express or implied. See the License for the specific language governing permissions
-//  and limitations under the License.
-
-// +build libstemmer full
-// +build icu full
-
-package da
-
-import (
-	"reflect"
-	"testing"
-
-	"github.com/blevesearch/bleve/analysis"
-	"github.com/blevesearch/bleve/registry"
-)
-
-func TestDanishAnalyzer(t *testing.T) {
-	tests := []struct {
-		input  []byte
-		output analysis.TokenStream
-	}{
-		// stemming
-		{
-			input: []byte("undersøg"),
-			output: analysis.TokenStream{
-				&analysis.Token{
-					Term:     []byte("undersøg"),
-					Position: 1,
-					Start:    0,
-					End:      9,
-				},
-			},
-		},
-		{
-			input: []byte("undersøgelse"),
-			output: analysis.TokenStream{
-				&analysis.Token{
-					Term:     []byte("undersøg"),
-					Position: 1,
-					Start:    0,
-					End:      13,
-				},
-			},
-		},
-		// stop word
-		{
-			input:  []byte("på"),
-			output: analysis.TokenStream{},
-		},
-	}
-
-	cache := registry.NewCache()
-	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
-	if err != nil {
-		t.Fatal(err)
-	}
-	for _, test := range tests {
-		actual := analyzer.Analyze(test.input)
-		if !reflect.DeepEqual(actual, test.output) {
-			t.Errorf("expected %v, got %v", test.output, actual)
-		}
-	}
-}

+ 0 - 28
analysis/language/da/stemmer_da.go

@@ -1,28 +0,0 @@
-//  Copyright (c) 2014 Couchbase, Inc.
-//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
-//  except in compliance with the License. You may obtain a copy of the License at
-//    http://www.apache.org/licenses/LICENSE-2.0
-//  Unless required by applicable law or agreed to in writing, software distributed under the
-//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
-//  either express or implied. See the License for the specific language governing permissions
-//  and limitations under the License.
-
-// +build libstemmer full
-
-package da
-
-import (
-	"github.com/blevesearch/bleve/analysis"
-	"github.com/blevesearch/bleve/analysis/token_filters/stemmer_filter"
-	"github.com/blevesearch/bleve/registry"
-)
-
-const StemmerName = "stemmer_da"
-
-func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
-	return stemmer_filter.NewStemmerFilter("da")
-}
-
-func init() {
-	registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
-}

+ 0 - 28
analysis/language/da/stop_filter_da.go

@@ -1,28 +0,0 @@
-//  Copyright (c) 2014 Couchbase, Inc.
-//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
-//  except in compliance with the License. You may obtain a copy of the License at
-//    http://www.apache.org/licenses/LICENSE-2.0
-//  Unless required by applicable law or agreed to in writing, software distributed under the
-//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
-//  either express or implied. See the License for the specific language governing permissions
-//  and limitations under the License.
-
-package da
-
-import (
-	"github.com/blevesearch/bleve/analysis"
-	"github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
-	"github.com/blevesearch/bleve/registry"
-)
-
-func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
-	tokenMap, err := cache.TokenMapNamed(StopName)
-	if err != nil {
-		return nil, err
-	}
-	return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
-}
-
-func init() {
-	registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
-}

+ 0 - 134
analysis/language/da/stop_words_da.go

@@ -1,134 +0,0 @@
-package da
-
-import (
-	"github.com/blevesearch/bleve/analysis"
-	"github.com/blevesearch/bleve/registry"
-)
-
-const StopName = "stop_da"
-
-// this content was obtained from:
-// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
-// ` was changed to ' to allow for literal string
-
-var DanishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/danish/stop.txt
- | This file is distributed under the BSD License.
- | See http://snowball.tartarus.org/license.php
- | Also see http://www.opensource.org/licenses/bsd-license.html
- |  - Encoding was converted to UTF-8.
- |  - This notice was added.
- |
- | NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
-
- | A Danish stop word list. Comments begin with vertical bar. Each stop
- | word is at the start of a line.
-
- | This is a ranked list (commonest to rarest) of stopwords derived from
- | a large text sample.
-
-
-og           | and
-i            | in
-jeg          | I
-det          | that (dem. pronoun)/it (pers. pronoun)
-at           | that (in front of a sentence)/to (with infinitive)
-en           | a/an
-den          | it (pers. pronoun)/that (dem. pronoun)
-til          | to/at/for/until/against/by/of/into, more
-er           | present tense of "to be"
-som          | who, as
-på           | on/upon/in/on/at/to/after/of/with/for, on
-de           | they
-med          | with/by/in, along
-han          | he
-af           | of/by/from/off/for/in/with/on, off
-for          | at/for/to/from/by/of/ago, in front/before, because
-ikke         | not
-der          | who/which, there/those
-var          | past tense of "to be"
-mig          | me/myself
-sig          | oneself/himself/herself/itself/themselves
-men          | but
-et           | a/an/one, one (number), someone/somebody/one
-har          | present tense of "to have"
-om           | round/about/for/in/a, about/around/down, if
-vi           | we
-min          | my
-havde        | past tense of "to have"
-ham          | him
-hun          | she
-nu           | now
-over         | over/above/across/by/beyond/past/on/about, over/past
-da           | then, when/as/since
-fra          | from/off/since, off, since
-du           | you
-ud           | out
-sin          | his/her/its/one's
-dem          | them
-os           | us/ourselves
-op           | up
-man          | you/one
-hans         | his
-hvor         | where
-eller        | or
-hvad         | what
-skal         | must/shall etc.
-selv         | myself/youself/herself/ourselves etc., even
-her          | here
-alle         | all/everyone/everybody etc.
-vil          | will (verb)
-blev         | past tense of "to stay/to remain/to get/to become"
-kunne        | could
-ind          | in
-når          | when
-være         | present tense of "to be"
-dog          | however/yet/after all
-noget        | something
-ville        | would
-jo           | you know/you see (adv), yes
-deres        | their/theirs
-efter        | after/behind/according to/for/by/from, later/afterwards
-ned          | down
-skulle       | should
-denne        | this
-end          | than
-dette        | this
-mit          | my/mine
-også         | also
-under        | under/beneath/below/during, below/underneath
-have         | have
-dig          | you
-anden        | other
-hende        | her
-mine         | my
-alt          | everything
-meget        | much/very, plenty of
-sit          | his, her, its, one's
-sine         | his, her, its, one's
-vor          | our
-mod          | against
-disse        | these
-hvis         | if
-din          | your/yours
-nogle        | some
-hos          | by/at
-blive        | be/become
-mange        | many
-ad           | by/through
-bliver       | present tense of "to be/to become"
-hendes       | her/hers
-været        | be
-thi          | for (conj)
-jer          | you
-sådan        | such, like this/like that
-`)
-
-func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
-	rv := analysis.NewTokenMap()
-	err := rv.LoadBytes(DanishStopWords)
-	return rv, err
-}
-
-func init() {
-	registry.RegisterTokenMap(StopName, TokenMapConstructor)
-}

+ 0 - 59
analysis/language/de/analyzer_de.go

@@ -1,59 +0,0 @@
-//  Copyright (c) 2014 Couchbase, Inc.
-//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
-//  except in compliance with the License. You may obtain a copy of the License at
-//    http://www.apache.org/licenses/LICENSE-2.0
-//  Unless required by applicable law or agreed to in writing, software distributed under the
-//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
-//  either express or implied. See the License for the specific language governing permissions
-//  and limitations under the License.
-
-// +build libstemmer full
-// +build icu full
-
-package de
-
-import (
-	"github.com/blevesearch/bleve/analysis"
-	"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
-	"github.com/blevesearch/bleve/analysis/tokenizers/icu"
-	"github.com/blevesearch/bleve/registry"
-)
-
-const AnalyzerName = "de"
-
-func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
-	icuTokenizer, err := cache.TokenizerNamed(icu.Name)
-	if err != nil {
-		return nil, err
-	}
-	toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
-	if err != nil {
-		return nil, err
-	}
-	stopDeFilter, err := cache.TokenFilterNamed(NormalizeName)
-	if err != nil {
-		return nil, err
-	}
-	normalizeDeFilter, err := cache.TokenFilterNamed(NormalizeName)
-	if err != nil {
-		return nil, err
-	}
-	stemmerDeFilter, err := cache.TokenFilterNamed(StemmerName)
-	if err != nil {
-		return nil, err
-	}
-	rv := analysis.Analyzer{
-		Tokenizer: icuTokenizer,
-		TokenFilters: []analysis.TokenFilter{
-			toLowerFilter,
-			stopDeFilter,
-			normalizeDeFilter,
-			stemmerDeFilter,
-		},
-	}
-	return &rv, nil
-}
-
-func init() {
-	registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
-}

+ 0 - 97
analysis/language/de/analyzer_de_test.go

@@ -1,97 +0,0 @@
-//  Copyright (c) 2014 Couchbase, Inc.
-//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
-//  except in compliance with the License. You may obtain a copy of the License at
-//    http://www.apache.org/licenses/LICENSE-2.0
-//  Unless required by applicable law or agreed to in writing, software distributed under the
-//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
-//  either express or implied. See the License for the specific language governing permissions
-//  and limitations under the License.
-
-// +build libstemmer full
-// +build icu full
-
-package de
-
-import (
-	"reflect"
-	"testing"
-
-	"github.com/blevesearch/bleve/analysis"
-	"github.com/blevesearch/bleve/registry"
-)
-
-func TestGermanAnalyzer(t *testing.T) {
-	tests := []struct {
-		input  []byte
-		output analysis.TokenStream
-	}{
-		{
-			input: []byte("Tisch"),
-			output: analysis.TokenStream{
-				&analysis.Token{
-					Term:     []byte("tisch"),
-					Position: 1,
-					Start:    0,
-					End:      5,
-				},
-			},
-		},
-		{
-			input: []byte("Tische"),
-			output: analysis.TokenStream{
-				&analysis.Token{
-					Term:     []byte("tisch"),
-					Position: 1,
-					Start:    0,
-					End:      6,
-				},
-			},
-		},
-		{
-			input: []byte("Tischen"),
-			output: analysis.TokenStream{
-				&analysis.Token{
-					Term:     []byte("tisch"),
-					Position: 1,
-					Start:    0,
-					End:      7,
-				},
-			},
-		},
-		// german specials
-		{
-			input: []byte("Schaltflächen"),
-			output: analysis.TokenStream{
-				&analysis.Token{
-					Term:     []byte("schaltflach"),
-					Position: 1,
-					Start:    0,
-					End:      14,
-				},
-			},
-		},
-		{
-			input: []byte("Schaltflaechen"),
-			output: analysis.TokenStream{
-				&analysis.Token{
-					Term:     []byte("schaltflach"),
-					Position: 1,
-					Start:    0,
-					End:      14,
-				},
-			},
-		},
-	}
-
-	cache := registry.NewCache()
-	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
-	if err != nil {
-		t.Fatal(err)
-	}
-	for _, test := range tests {
-		actual := analyzer.Analyze(test.input)
-		if !reflect.DeepEqual(actual, test.output) {
-			t.Errorf("expected %v, got %v", test.output, actual)
-		}
-	}
-}

+ 0 - 94
analysis/language/de/german_normalize.go

@@ -1,94 +0,0 @@
-//  Copyright (c) 2014 Couchbase, Inc.
-//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
-//  except in compliance with the License. You may obtain a copy of the License at
-//    http://www.apache.org/licenses/LICENSE-2.0
-//  Unless required by applicable law or agreed to in writing, software distributed under the
-//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
-//  either express or implied. See the License for the specific language governing permissions
-//  and limitations under the License.
-
-package de
-
-import (
-	"bytes"
-
-	"github.com/blevesearch/bleve/analysis"
-	"github.com/blevesearch/bleve/registry"
-)
-
-const NormalizeName = "normalize_de"
-
-const (
-	N = 0 /* ordinary state */
-	V = 1 /* stops 'u' from entering umlaut state */
-	U = 2 /* umlaut state, allows e-deletion */
-)
-
-type GermanNormalizeFilter struct {
-}
-
-func NewGermanNormalizeFilter() *GermanNormalizeFilter {
-	return &GermanNormalizeFilter{}
-}
-
-func (s *GermanNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream {
-	for _, token := range input {
-		term := normalize(token.Term)
-		token.Term = term
-	}
-	return input
-}
-
-func normalize(input []byte) []byte {
-	state := N
-	runes := bytes.Runes(input)
-	for i := 0; i < len(runes); i++ {
-		switch runes[i] {
-		case 'a', 'o':
-			state = U
-		case 'u':
-			if state == N {
-				state = U
-			} else {
-				state = V
-			}
-		case 'e':
-			if state == U {
-				runes = analysis.DeleteRune(runes, i)
-				i--
-			}
-			state = V
-		case 'i', 'q', 'y':
-			state = V
-		case 'ä':
-			runes[i] = 'a'
-			state = V
-		case 'ö':
-			runes[i] = 'o'
-			state = V
-		case 'ü':
-			runes[i] = 'u'
-			state = V
-		case 'ß':
-			runes[i] = 's'
-			i++
-			// newrunes := make([]rune, len(runes)+1)
-			// copy(newrunes, runes)
-			// runes = newrunes
-			// runes[i] = 's'
-			runes = analysis.InsertRune(runes, i, 's')
-			state = N
-		default:
-			state = N
-		}
-	}
-	return analysis.BuildTermFromRunes(runes)
-}
-
-func NormalizerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
-	return NewGermanNormalizeFilter(), nil
-}
-
-func init() {
-	registry.RegisterTokenFilter(NormalizeName, NormalizerFilterConstructor)
-}

+ 0 - 98
analysis/language/de/german_normalize_test.go

@@ -1,98 +0,0 @@
-//  Copyright (c) 2014 Couchbase, Inc.
-//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
-//  except in compliance with the License. You may obtain a copy of the License at
-//    http://www.apache.org/licenses/LICENSE-2.0
-//  Unless required by applicable law or agreed to in writing, software distributed under the
-//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
-//  either express or implied. See the License for the specific language governing permissions
-//  and limitations under the License.
-
-package de
-
-import (
-	"reflect"
-	"testing"
-
-	"github.com/blevesearch/bleve/analysis"
-)
-
-func TestGermanNormalizeFilter(t *testing.T) {
-	tests := []struct {
-		input  analysis.TokenStream
-		output analysis.TokenStream
-	}{
-		// Tests that a/o/u + e is equivalent to the umlaut form
-		{
-			input: analysis.TokenStream{
-				&analysis.Token{
-					Term: []byte("Schaltflächen"),
-				},
-			},
-			output: analysis.TokenStream{
-				&analysis.Token{
-					Term: []byte("Schaltflachen"),
-				},
-			},
-		},
-		{
-			input: analysis.TokenStream{
-				&analysis.Token{
-					Term: []byte("Schaltflaechen"),
-				},
-			},
-			output: analysis.TokenStream{
-				&analysis.Token{
-					Term: []byte("Schaltflachen"),
-				},
-			},
-		},
-		// Tests the specific heuristic that ue is not folded after a vowel or q.
-		{
-			input: analysis.TokenStream{
-				&analysis.Token{
-					Term: []byte("dauer"),
-				},
-			},
-			output: analysis.TokenStream{
-				&analysis.Token{
-					Term: []byte("dauer"),
-				},
-			},
-		},
-		// Tests german specific folding of sharp-s
-		{
-			input: analysis.TokenStream{
-				&analysis.Token{
-					Term: []byte("weißbier"),
-				},
-			},
-			output: analysis.TokenStream{
-				&analysis.Token{
-					Term: []byte("weissbier"),
-				},
-			},
-		},
-		// empty
-		{
-			input: analysis.TokenStream{
-				&analysis.Token{
-					Term: []byte(""),
-				},
-			},
-			output: analysis.TokenStream{
-				&analysis.Token{
-					Term: []byte(""),
-				},
-			},
-		},
-	}
-
-	germanNormalizeFilter := NewGermanNormalizeFilter()
-	for _, test := range tests {
-		actual := germanNormalizeFilter.Filter(test.input)
-		if !reflect.DeepEqual(actual, test.output) {
-			t.Errorf("expected %#v, got %#v", test.output, actual)
-			t.Errorf("expected %s(% x), got %s(% x)", test.output[0].Term, test.output[0].Term, actual[0].Term, actual[0].Term)
-		}
-	}
-}

+ 0 - 28
analysis/language/de/stemmer_de.go

@@ -1,28 +0,0 @@
-//  Copyright (c) 2014 Couchbase, Inc.
-//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
-//  except in compliance with the License. You may obtain a copy of the License at
-//    http://www.apache.org/licenses/LICENSE-2.0
-//  Unless required by applicable law or agreed to in writing, software distributed under the
-//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
-//  either express or implied. See the License for the specific language governing permissions
-//  and limitations under the License.
-
-// +build libstemmer full
-
-package de
-
-import (
-	"github.com/blevesearch/bleve/analysis"
-	"github.com/blevesearch/bleve/analysis/token_filters/stemmer_filter"
-	"github.com/blevesearch/bleve/registry"
-)
-
-const StemmerName = "stemmer_de"
-
-func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
-	return stemmer_filter.NewStemmerFilter("de")
-}
-
-func init() {
-	registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
-}

+ 0 - 28
analysis/language/de/stop_filter_de.go

@@ -1,28 +0,0 @@
-//  Copyright (c) 2014 Couchbase, Inc.
-//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
-//  except in compliance with the License. You may obtain a copy of the License at
-//    http://www.apache.org/licenses/LICENSE-2.0
-//  Unless required by applicable law or agreed to in writing, software distributed under the
-//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
-//  either express or implied. See the License for the specific language governing permissions
-//  and limitations under the License.
-
-package de
-
-import (
-	"github.com/blevesearch/bleve/analysis"
-	"github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
-	"github.com/blevesearch/bleve/registry"
-)
-
-func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
-	tokenMap, err := cache.TokenMapNamed(StopName)
-	if err != nil {
-		return nil, err
-	}
-	return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
-}
-
-func init() {
-	registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
-}

+ 0 - 318
analysis/language/de/stop_words_de.go

@@ -1,318 +0,0 @@
-package de
-
-import (
-	"github.com/blevesearch/bleve/analysis"
-	"github.com/blevesearch/bleve/registry"
-)
-
-const StopName = "stop_de"
-
-// this content was obtained from:
-// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
-// ` was changed to ' to allow for literal string
-
-var GermanStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/german/stop.txt
- | This file is distributed under the BSD License.
- | See http://snowball.tartarus.org/license.php
- | Also see http://www.opensource.org/licenses/bsd-license.html
- |  - Encoding was converted to UTF-8.
- |  - This notice was added.
- |
- | NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
-
- | A German stop word list. Comments begin with vertical bar. Each stop
- | word is at the start of a line.
-
- | The number of forms in this list is reduced significantly by passing it
- | through the German stemmer.
-
-
-aber           |  but
-
-alle           |  all
-allem
-allen
-aller
-alles
-
-als            |  than, as
-also           |  so
-am             |  an + dem
-an             |  at
-
-ander          |  other
-andere
-anderem
-anderen
-anderer
-anderes
-anderm
-andern
-anderr
-anders
-
-auch           |  also
-auf            |  on
-aus            |  out of
-bei            |  by
-bin            |  am
-bis            |  until
-bist           |  art
-da             |  there
-damit          |  with it
-dann           |  then
-
-der            |  the
-den
-des
-dem
-die
-das
-
-daß            |  that
-
-derselbe       |  the same
-derselben
-denselben
-desselben
-demselben
-dieselbe
-dieselben
-dasselbe
-
-dazu           |  to that
-
-dein           |  thy
-deine
-deinem
-deinen
-deiner
-deines
-
-denn           |  because
-
-derer          |  of those
-dessen         |  of him
-
-dich           |  thee
-dir            |  to thee
-du             |  thou
-
-dies           |  this
-diese
-diesem
-diesen
-dieser
-dieses
-
-
-doch           |  (several meanings)
-dort           |  (over) there
-
-
-durch          |  through
-
-ein            |  a
-eine
-einem
-einen
-einer
-eines
-
-einig          |  some
-einige
-einigem
-einigen
-einiger
-einiges
-
-einmal         |  once
-
-er             |  he
-ihn            |  him
-ihm            |  to him
-
-es             |  it
-etwas          |  something
-
-euer           |  your
-eure
-eurem
-euren
-eurer
-eures
-
-für            |  for
-gegen          |  towards
-gewesen        |  p.p. of sein
-hab            |  have
-habe           |  have
-haben          |  have
-hat            |  has
-hatte          |  had
-hatten         |  had
-hier           |  here
-hin            |  there
-hinter         |  behind
-
-ich            |  I
-mich           |  me
-mir            |  to me
-
-
-ihr            |  you, to her
-ihre
-ihrem
-ihren
-ihrer
-ihres
-euch           |  to you
-
-im             |  in + dem
-in             |  in
-indem          |  while
-ins            |  in + das
-ist            |  is
-
-jede           |  each, every
-jedem
-jeden
-jeder
-jedes
-
-jene           |  that
-jenem
-jenen
-jener
-jenes
-
-jetzt          |  now
-kann           |  can
-
-kein           |  no
-keine
-keinem
-keinen
-keiner
-keines
-
-können         |  can
-könnte         |  could
-machen         |  do
-man            |  one
-
-manche         |  some, many a
-manchem
-manchen
-mancher
-manches
-
-mein           |  my
-meine
-meinem
-meinen
-meiner
-meines
-
-mit            |  with
-muss           |  must
-musste         |  had to
-nach           |  to(wards)
-nicht          |  not
-nichts         |  nothing
-noch           |  still, yet
-nun            |  now
-nur            |  only
-ob             |  whether
-oder           |  or
-ohne           |  without
-sehr           |  very
-
-sein           |  his
-seine
-seinem
-seinen
-seiner
-seines
-
-selbst         |  self
-sich           |  herself
-
-sie            |  they, she
-ihnen          |  to them
-
-sind           |  are
-so             |  so
-
-solche         |  such
-solchem
-solchen
-solcher
-solches
-
-soll           |  shall
-sollte         |  should
-sondern        |  but
-sonst          |  else
-über           |  over
-um             |  about, around
-und            |  and
-
-uns            |  us
-unse
-unsem
-unsen
-unser
-unses
-
-unter          |  under
-viel           |  much
-vom            |  von + dem
-von            |  from
-vor            |  before
-während        |  while
-war            |  was
-waren          |  were
-warst          |  wast
-was            |  what
-weg            |  away, off
-weil           |  because
-weiter         |  further
-
-welche         |  which
-welchem
-welchen
-welcher
-welches
-
-wenn           |  when
-werde          |  will
-werden         |  will
-wie            |  how
-wieder         |  again
-will           |  want
-wir            |  we
-wird           |  will
-wirst          |  willst
-wo             |  where
-wollen         |  want
-wollte         |  wanted
-würde          |  would
-würden         |  would
-zu             |  to
-zum            |  zu + dem
-zur            |  zu + der
-zwar           |  indeed
-zwischen       |  between
-
-`)
-
-func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
-	rv := analysis.NewTokenMap()
-	err := rv.LoadBytes(GermanStopWords)
-	return rv, err
-}
-
-func init() {
-	registry.RegisterTokenMap(StopName, TokenMapConstructor)
-}

+ 0 - 28
analysis/language/en/stemmer_en.go

@@ -1,28 +0,0 @@
-//  Copyright (c) 2014 Couchbase, Inc.
-//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
-//  except in compliance with the License. You may obtain a copy of the License at
-//    http://www.apache.org/licenses/LICENSE-2.0
-//  Unless required by applicable law or agreed to in writing, software distributed under the
-//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
-//  either express or implied. See the License for the specific language governing permissions
-//  and limitations under the License.
-
-// +build libstemmer full
-
-package en
-
-import (
-	"github.com/blevesearch/bleve/analysis"
-	"github.com/blevesearch/bleve/analysis/token_filters/stemmer_filter"
-	"github.com/blevesearch/bleve/registry"
-)
-
-const StemmerName = "stemmer_en"
-
-func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
-	return stemmer_filter.NewStemmerFilter("en")
-}
-
-func init() {
-	registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
-}

+ 0 - 72
analysis/language/en/stemmer_en_test.go

@@ -1,72 +0,0 @@
-//  Copyright (c) 2014 Couchbase, Inc.
-//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
-//  except in compliance with the License. You may obtain a copy of the License at
-//    http://www.apache.org/licenses/LICENSE-2.0
-//  Unless required by applicable law or agreed to in writing, software distributed under the
-//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
-//  either express or implied. See the License for the specific language governing permissions
-//  and limitations under the License.
-
-// +build libstemmer full
-
-package en
-
-import (
-	"reflect"
-	"testing"
-
-	"github.com/blevesearch/bleve/analysis"
-	"github.com/blevesearch/bleve/registry"
-)
-
-func TestEnglishStemmer(t *testing.T) {
-	tests := []struct {
-		input  analysis.TokenStream
-		output analysis.TokenStream
-	}{
-		{
-			input: analysis.TokenStream{
-				&analysis.Token{
-					Term: []byte("walking"),
-				},
-				&analysis.Token{
-					Term: []byte("talked"),
-				},
-				&analysis.Token{
-					Term: []byte("business"),
-				},
-				&analysis.Token{
-					Term:    []byte("protected"),
-					KeyWord: true,
-				},
-			},
-			output: analysis.TokenStream{
-				&analysis.Token{
-					Term: []byte("walk"),
-				},
-				&analysis.Token{
-					Term: []byte("talk"),
-				},
-				&analysis.Token{
-					Term: []byte("busi"),
-				},
-				&analysis.Token{
-					Term:    []byte("protected"),
-					KeyWord: true,
-				},
-			},
-		},
-	}
-
-	cache := registry.NewCache()
-	stemmerFilter, err := cache.TokenFilterNamed(StemmerName)
-	if err != nil {
-		t.Fatal(err)
-	}
-	for _, test := range tests {
-		actual := stemmerFilter.Filter(test.input)
-		if !reflect.DeepEqual(actual, test.output) {
-			t.Errorf("expected %s, got %s", test.output, actual)
-		}
-	}
-}

+ 0 - 55
analysis/language/es/analyzer_es.go

@@ -1,55 +0,0 @@
-//  Copyright (c) 2014 Couchbase, Inc.
-//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
-//  except in compliance with the License. You may obtain a copy of the License at
-//    http://www.apache.org/licenses/LICENSE-2.0
-//  Unless required by applicable law or agreed to in writing, software distributed under the
-//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
-//  either express or implied. See the License for the specific language governing permissions
-//  and limitations under the License.
-
-// +build libstemmer full
-// +build icu full
-
-package es
-
-import (
-	"github.com/blevesearch/bleve/analysis"
-	"github.com/blevesearch/bleve/registry"
-
-	"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
-	"github.com/blevesearch/bleve/analysis/tokenizers/icu"
-)
-
-const AnalyzerName = "es"
-
-func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
-	icuTokenizer, err := cache.TokenizerNamed(icu.Name)
-	if err != nil {
-		return nil, err
-	}
-	toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
-	if err != nil {
-		return nil, err
-	}
-	stopEsFilter, err := cache.TokenFilterNamed(StopName)
-	if err != nil {
-		return nil, err
-	}
-	stemmerEsFilter, err := cache.TokenFilterNamed(StemmerName)
-	if err != nil {
-		return nil, err
-	}
-	rv := analysis.Analyzer{
-		Tokenizer: icuTokenizer,
-		TokenFilters: []analysis.TokenFilter{
-			toLowerFilter,
-			stopEsFilter,
-			stemmerEsFilter,
-		},
-	}
-	return &rv, nil
-}
-
-func init() {
-	registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
-}

+ 0 - 64
analysis/language/es/analyzer_es_test.go

@@ -1,64 +0,0 @@
-//  Copyright (c) 2014 Couchbase, Inc.
-//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
-//  except in compliance with the License. You may obtain a copy of the License at
-//    http://www.apache.org/licenses/LICENSE-2.0
-//  Unless required by applicable law or agreed to in writing, software distributed under the
-//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
-//  either express or implied. See the License for the specific language governing permissions
-//  and limitations under the License.
-
-// +build libstemmer full
-// +build icu full
-
-package es
-
-import (
-	"reflect"
-	"testing"
-
-	"github.com/blevesearch/bleve/analysis"
-	"github.com/blevesearch/bleve/registry"
-)
-
-func TestSpanishAnalyzer(t *testing.T) {
-	tests := []struct {
-		input  []byte
-		output analysis.TokenStream
-	}{
-		// stemming
-		{
-			input: []byte("chicana"),
-			output: analysis.TokenStream{
-				&analysis.Token{
-					Term:     []byte("chican"),
-					Position: 1,
-					Start:    0,
-					End:      7,
-				},
-			},
-		},
-		{
-			input: []byte("chicano"),
-			output: analysis.TokenStream{
-				&analysis.Token{
-					Term:     []byte("chican"),
-					Position: 1,
-					Start:    0,
-					End:      7,
-				},
-			},
-		},
-	}
-
-	cache := registry.NewCache()
-	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
-	if err != nil {
-		t.Fatal(err)
-	}
-	for _, test := range tests {
-		actual := analyzer.Analyze(test.input)
-		if !reflect.DeepEqual(actual, test.output) {
-			t.Errorf("expected %v, got %v", test.output, actual)
-		}
-	}
-}

+ 0 - 28
analysis/language/es/stemmer_es.go

@@ -1,28 +0,0 @@
-//  Copyright (c) 2014 Couchbase, Inc.
-//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
-//  except in compliance with the License. You may obtain a copy of the License at
-//    http://www.apache.org/licenses/LICENSE-2.0
-//  Unless required by applicable law or agreed to in writing, software distributed under the
-//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
-//  either express or implied. See the License for the specific language governing permissions
-//  and limitations under the License.
-
-// +build libstemmer full
-
-package es
-
-import (
-	"github.com/blevesearch/bleve/analysis"
-	"github.com/blevesearch/bleve/analysis/token_filters/stemmer_filter"
-	"github.com/blevesearch/bleve/registry"
-)
-
-const StemmerName = "stemmer_es"
-
-func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
-	return stemmer_filter.NewStemmerFilter("es")
-}
-
-func init() {
-	registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
-}

+ 0 - 28
analysis/language/es/stop_filter_es.go

@@ -1,28 +0,0 @@
-//  Copyright (c) 2014 Couchbase, Inc.
-//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
-//  except in compliance with the License. You may obtain a copy of the License at
-//    http://www.apache.org/licenses/LICENSE-2.0
-//  Unless required by applicable law or agreed to in writing, software distributed under the
-//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
-//  either express or implied. See the License for the specific language governing permissions
-//  and limitations under the License.
-
-package es
-
-import (
-	"github.com/blevesearch/bleve/analysis"
-	"github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
-	"github.com/blevesearch/bleve/registry"
-)
-
-func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
-	tokenMap, err := cache.TokenMapNamed(StopName)
-	if err != nil {
-		return nil, err
-	}
-	return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
-}
-
-func init() {
-	registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
-}

+ 0 - 380
analysis/language/es/stop_words_es.go

@@ -1,380 +0,0 @@
-package es
-
-import (
-	"github.com/blevesearch/bleve/analysis"
-	"github.com/blevesearch/bleve/registry"
-)
-
-const StopName = "stop_es"
-
-// this content was obtained from:
-// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
-// ` was changed to ' to allow for literal string
-
-var SpanishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/spanish/stop.txt
- | This file is distributed under the BSD License.
- | See http://snowball.tartarus.org/license.php
- | Also see http://www.opensource.org/licenses/bsd-license.html
- |  - Encoding was converted to UTF-8.
- |  - This notice was added.
- |
- | NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
-
- | A Spanish stop word list. Comments begin with vertical bar. Each stop
- | word is at the start of a line.
-
-
- | The following is a ranked list (commonest to rarest) of stopwords
- | deriving from a large sample of text.
-
- | Extra words have been added at the end.
-
-de             |  from, of
-la             |  the, her
-que            |  who, that
-el             |  the
-en             |  in
-y              |  and
-a              |  to
-los            |  the, them
-del            |  de + el
-se             |  himself, from him etc
-las            |  the, them
-por            |  for, by, etc
-un             |  a
-para           |  for
-con            |  with
-no             |  no
-una            |  a
-su             |  his, her
-al             |  a + el
-  | es         from SER
-lo             |  him
-como           |  how
-más            |  more
-pero           |  pero
-sus            |  su plural
-le             |  to him, her
-ya             |  already
-o              |  or
-  | fue        from SER
-este           |  this
-  | ha         from HABER
-sí             |  himself etc
-porque         |  because
-esta           |  this
-  | son        from SER
-entre          |  between
-  | está     from ESTAR
-cuando         |  when
-muy            |  very
-sin            |  without
-sobre          |  on
-  | ser        from SER
-  | tiene      from TENER
-también        |  also
-me             |  me
-hasta          |  until
-hay            |  there is/are
-donde          |  where
-  | han        from HABER
-quien          |  whom, that
-  | están      from ESTAR
-  | estado     from ESTAR
-desde          |  from
-todo           |  all
-nos            |  us
-durante        |  during
-  | estados    from ESTAR
-todos          |  all
-uno            |  a
-les            |  to them
-ni             |  nor
-contra         |  against
-otros          |  other
-  | fueron     from SER
-ese            |  that
-eso            |  that
-  | había      from HABER
-ante           |  before
-ellos          |  they
-e              |  and (variant of y)
-esto           |  this
-mí             |  me
-antes          |  before
-algunos        |  some
-qué            |  what?
-unos           |  a
-yo             |  I
-otro           |  other
-otras          |  other
-otra           |  other
-él             |  he
-tanto          |  so much, many
-esa            |  that
-estos          |  these
-mucho          |  much, many
-quienes        |  who
-nada           |  nothing
-muchos         |  many
-cual           |  who
-  | sea        from SER
-poco           |  few
-ella           |  she
-estar          |  to be
-  | haber      from HABER
-estas          |  these
-  | estaba     from ESTAR
-  | estamos    from ESTAR
-algunas        |  some
-algo           |  something
-nosotros       |  we
-
-      | other forms
-
-mi             |  me
-mis            |  mi plural
-tú             |  thou
-te             |  thee
-ti             |  thee
-tu             |  thy
-tus            |  tu plural
-ellas          |  they
-nosotras       |  we
-vosotros       |  you
-vosotras       |  you
-os             |  you
-mío            |  mine
-mía            |
-míos           |
-mías           |
-tuyo           |  thine
-tuya           |
-tuyos          |
-tuyas          |
-suyo           |  his, hers, theirs
-suya           |
-suyos          |
-suyas          |
-nuestro        |  ours
-nuestra        |
-nuestros       |
-nuestras       |
-vuestro        |  yours
-vuestra        |
-vuestros       |
-vuestras       |
-esos           |  those
-esas           |  those
-
-               | forms of estar, to be (not including the infinitive):
-estoy
-estás
-está
-estamos
-estáis
-están
-esté
-estés
-estemos
-estéis
-estén
-estaré
-estarás
-estará
-estaremos
-estaréis
-estarán
-estaría
-estarías
-estaríamos
-estaríais
-estarían
-estaba
-estabas
-estábamos
-estabais
-estaban
-estuve
-estuviste
-estuvo
-estuvimos
-estuvisteis
-estuvieron
-estuviera
-estuvieras
-estuviéramos
-estuvierais
-estuvieran
-estuviese
-estuvieses
-estuviésemos
-estuvieseis
-estuviesen
-estando
-estado
-estada
-estados
-estadas
-estad
-
-               | forms of haber, to have (not including the infinitive):
-he
-has
-ha
-hemos
-habéis
-han
-haya
-hayas
-hayamos
-hayáis
-hayan
-habré
-habrás
-habrá
-habremos
-habréis
-habrán
-habría
-habrías
-habríamos
-habríais
-habrían
-había
-habías
-habíamos
-habíais
-habían
-hube
-hubiste
-hubo
-hubimos
-hubisteis
-hubieron
-hubiera
-hubieras
-hubiéramos
-hubierais
-hubieran
-hubiese
-hubieses
-hubiésemos
-hubieseis
-hubiesen
-habiendo
-habido
-habida
-habidos
-habidas
-
-               | forms of ser, to be (not including the infinitive):
-soy
-eres
-es
-somos
-sois
-son
-sea
-seas
-seamos
-seáis
-sean
-seré
-serás
-será
-seremos
-seréis
-serán
-sería
-serías
-seríamos
-seríais
-serían
-era
-eras
-éramos
-erais
-eran
-fui
-fuiste
-fue
-fuimos
-fuisteis
-fueron
-fuera
-fueras
-fuéramos
-fuerais
-fueran
-fuese
-fueses
-fuésemos
-fueseis
-fuesen
-siendo
-sido
-  |  sed also means 'thirst'
-
-               | forms of tener, to have (not including the infinitive):
-tengo
-tienes
-tiene
-tenemos
-tenéis
-tienen
-tenga
-tengas
-tengamos
-tengáis
-tengan
-tendré
-tendrás
-tendrá
-tendremos
-tendréis
-tendrán
-tendría
-tendrías
-tendríamos
-tendríais
-tendrían
-tenía
-tenías
-teníamos
-teníais
-tenían
-tuve
-tuviste
-tuvo
-tuvimos
-tuvisteis
-tuvieron
-tuviera
-tuvieras
-tuviéramos
-tuvierais
-tuvieran
-tuviese
-tuvieses
-tuviésemos
-tuvieseis
-tuviesen
-teniendo
-tenido
-tenida
-tenidos
-tenidas
-tened
-
-`)
-
-func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
-	rv := analysis.NewTokenMap()
-	err := rv.LoadBytes(SpanishStopWords)
-	return rv, err
-}
-
-func init() {
-	registry.RegisterTokenMap(StopName, TokenMapConstructor)
-}

+ 3 - 5
analysis/language/fa/analyzer_fa.go

@@ -7,8 +7,6 @@
 //  either express or implied. See the License for the specific language governing permissions
 //  and limitations under the License.
 
-// +build icu full
-
 package fa
 
 import (
@@ -18,7 +16,7 @@ import (
 	"github.com/blevesearch/bleve/analysis/char_filters/zero_width_non_joiner"
 	"github.com/blevesearch/bleve/analysis/language/ar"
 	"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
-	"github.com/blevesearch/bleve/analysis/tokenizers/icu"
+	"github.com/blevesearch/bleve/analysis/tokenizers/unicode"
 )
 
 const AnalyzerName = "fa"
@@ -28,7 +26,7 @@ func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (
 	if err != nil {
 		return nil, err
 	}
-	icuTokenizer, err := cache.TokenizerNamed(icu.Name)
+	unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name)
 	if err != nil {
 		return nil, err
 	}
@@ -52,7 +50,7 @@ func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (
 		CharFilters: []analysis.CharFilter{
 			zFilter,
 		},
-		Tokenizer: icuTokenizer,
+		Tokenizer: unicodeTokenizer,
 		TokenFilters: []analysis.TokenFilter{
 			toLowerFilter,
 			normArFilter,

+ 0 - 2
analysis/language/fa/analyzer_fa_test.go

@@ -7,8 +7,6 @@
 //  either express or implied. See the License for the specific language governing permissions
 //  and limitations under the License.
 
-// +build icu full
-
 package fa
 
 import (

+ 0 - 55
analysis/language/fi/analyzer_fi.go

@@ -1,55 +0,0 @@
-//  Copyright (c) 2014 Couchbase, Inc.
-//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
-//  except in compliance with the License. You may obtain a copy of the License at
-//    http://www.apache.org/licenses/LICENSE-2.0
-//  Unless required by applicable law or agreed to in writing, software distributed under the
-//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
-//  either express or implied. See the License for the specific language governing permissions
-//  and limitations under the License.
-
-// +build libstemmer full
-// +build icu full
-
-package fi
-
-import (
-	"github.com/blevesearch/bleve/analysis"
-	"github.com/blevesearch/bleve/registry"
-
-	"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
-	"github.com/blevesearch/bleve/analysis/tokenizers/icu"
-)
-
-const AnalyzerName = "fi"
-
-func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
-	icuTokenizer, err := cache.TokenizerNamed(icu.Name)
-	if err != nil {
-		return nil, err
-	}
-	toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
-	if err != nil {
-		return nil, err
-	}
-	stopFiFilter, err := cache.TokenFilterNamed(StopName)
-	if err != nil {
-		return nil, err
-	}
-	stemmerFiFilter, err := cache.TokenFilterNamed(StemmerName)
-	if err != nil {
-		return nil, err
-	}
-	rv := analysis.Analyzer{
-		Tokenizer: icuTokenizer,
-		TokenFilters: []analysis.TokenFilter{
-			toLowerFilter,
-			stopFiFilter,
-			stemmerFiFilter,
-		},
-	}
-	return &rv, nil
-}
-
-func init() {
-	registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
-}

+ 0 - 68
analysis/language/fi/analyzer_fi_test.go

@@ -1,68 +0,0 @@
-//  Copyright (c) 2014 Couchbase, Inc.
-//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
-//  except in compliance with the License. You may obtain a copy of the License at
-//    http://www.apache.org/licenses/LICENSE-2.0
-//  Unless required by applicable law or agreed to in writing, software distributed under the
-//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
-//  either express or implied. See the License for the specific language governing permissions
-//  and limitations under the License.
-
-// +build libstemmer full
-// +build icu full
-
-package fi
-
-import (
-	"reflect"
-	"testing"
-
-	"github.com/blevesearch/bleve/analysis"
-	"github.com/blevesearch/bleve/registry"
-)
-
-func TestFinishAnalyzer(t *testing.T) {
-	tests := []struct {
-		input  []byte
-		output analysis.TokenStream
-	}{
-		// stemming
-		{
-			input: []byte("edeltäjiinsä"),
-			output: analysis.TokenStream{
-				&analysis.Token{
-					Term: []byte("edeltäj"),
-				},
-			},
-		},
-		{
-			input: []byte("edeltäjistään"),
-			output: analysis.TokenStream{
-				&analysis.Token{
-					Term: []byte("edeltäj"),
-				},
-			},
-		},
-		// stop word
-		{
-			input:  []byte("olla"),
-			output: analysis.TokenStream{},
-		},
-	}
-
-	cache := registry.NewCache()
-	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
-	if err != nil {
-		t.Fatal(err)
-	}
-	for _, test := range tests {
-		actual := analyzer.Analyze(test.input)
-		if len(actual) != len(test.output) {
-			t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
-		}
-		for i, tok := range actual {
-			if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
-				t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
-			}
-		}
-	}
-}

+ 0 - 28
analysis/language/fi/stemmer_fi.go

@@ -1,28 +0,0 @@
-//  Copyright (c) 2014 Couchbase, Inc.
-//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
-//  except in compliance with the License. You may obtain a copy of the License at
-//    http://www.apache.org/licenses/LICENSE-2.0
-//  Unless required by applicable law or agreed to in writing, software distributed under the
-//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
-//  either express or implied. See the License for the specific language governing permissions
-//  and limitations under the License.
-
-// +build libstemmer full
-
-package fi
-
-import (
-	"github.com/blevesearch/bleve/analysis"
-	"github.com/blevesearch/bleve/analysis/token_filters/stemmer_filter"
-	"github.com/blevesearch/bleve/registry"
-)
-
-const StemmerName = "stemmer_fi"
-
-func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
-	return stemmer_filter.NewStemmerFilter("fi")
-}
-
-func init() {
-	registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
-}

+ 0 - 28
analysis/language/fi/stop_filter_fi.go

@@ -1,28 +0,0 @@
-//  Copyright (c) 2014 Couchbase, Inc.
-//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
-//  except in compliance with the License. You may obtain a copy of the License at
-//    http://www.apache.org/licenses/LICENSE-2.0
-//  Unless required by applicable law or agreed to in writing, software distributed under the
-//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
-//  either express or implied. See the License for the specific language governing permissions
-//  and limitations under the License.
-
-package fi
-
-import (
-	"github.com/blevesearch/bleve/analysis"
-	"github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
-	"github.com/blevesearch/bleve/registry"
-)
-
-func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
-	tokenMap, err := cache.TokenMapNamed(StopName)
-	if err != nil {
-		return nil, err
-	}
-	return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
-}
-
-func init() {
-	registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
-}

+ 0 - 121
analysis/language/fi/stop_words_fi.go

@@ -1,121 +0,0 @@
-package fi
-
-import (
-	"github.com/blevesearch/bleve/analysis"
-	"github.com/blevesearch/bleve/registry"
-)
-
-const StopName = "stop_fi"
-
-// this content was obtained from:
-// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
-// ` was changed to ' to allow for literal string
-
-var FinnishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/finnish/stop.txt
- | This file is distributed under the BSD License.
- | See http://snowball.tartarus.org/license.php
- | Also see http://www.opensource.org/licenses/bsd-license.html
- |  - Encoding was converted to UTF-8.
- |  - This notice was added.
- |
- | NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
- 
-| forms of BE
-
-olla
-olen
-olet
-on
-olemme
-olette
-ovat
-ole        | negative form
-
-oli
-olisi
-olisit
-olisin
-olisimme
-olisitte
-olisivat
-olit
-olin
-olimme
-olitte
-olivat
-ollut
-olleet
-
-en         | negation
-et
-ei
-emme
-ette
-eivät
-
-|Nom   Gen    Acc    Part   Iness   Elat    Illat  Adess   Ablat   Allat   Ess    Trans
-minä   minun  minut  minua  minussa minusta minuun minulla minulta minulle               | I
-sinä   sinun  sinut  sinua  sinussa sinusta sinuun sinulla sinulta sinulle               | you
-hän    hänen  hänet  häntä  hänessä hänestä häneen hänellä häneltä hänelle               | he she
-me     meidän meidät meitä  meissä  meistä  meihin meillä  meiltä  meille                | we
-te     teidän teidät teitä  teissä  teistä  teihin teillä  teiltä  teille                | you
-he     heidän heidät heitä  heissä  heistä  heihin heillä  heiltä  heille                | they
-
-tämä   tämän         tätä   tässä   tästä   tähän  tallä   tältä   tälle   tänä   täksi  | this
-tuo    tuon          tuotä  tuossa  tuosta  tuohon tuolla  tuolta  tuolle  tuona  tuoksi | that
-se     sen           sitä   siinä   siitä   siihen sillä   siltä   sille   sinä   siksi  | it
-nämä   näiden        näitä  näissä  näistä  näihin näillä  näiltä  näille  näinä  näiksi | these
-nuo    noiden        noita  noissa  noista  noihin noilla  noilta  noille  noina  noiksi | those
-ne     niiden        niitä  niissä  niistä  niihin niillä  niiltä  niille  niinä  niiksi | they
-
-kuka   kenen kenet   ketä   kenessä kenestä keneen kenellä keneltä kenelle kenenä keneksi| who
-ketkä  keiden ketkä  keitä  keissä  keistä  keihin keillä  keiltä  keille  keinä  keiksi | (pl)
-mikä   minkä minkä   mitä   missä   mistä   mihin  millä   miltä   mille   minä   miksi  | which what
-mitkä                                                                                    | (pl)
-
-joka   jonka         jota   jossa   josta   johon  jolla   jolta   jolle   jona   joksi  | who which
-jotka  joiden        joita  joissa  joista  joihin joilla  joilta  joille  joina  joiksi | (pl)
-
-| conjunctions
-
-että   | that
-ja     | and
-jos    | if
-koska  | because
-kuin   | than
-mutta  | but
-niin   | so
-sekä   | and
-sillä  | for
-tai    | or
-vaan   | but
-vai    | or
-vaikka | although
-
-
-| prepositions
-
-kanssa  | with
-mukaan  | according to
-noin    | about
-poikki  | across
-yli     | over, across
-
-| other
-
-kun    | when
-niin   | so
-nyt    | now
-itse   | self
-
-`)
-
-func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
-	rv := analysis.NewTokenMap()
-	err := rv.LoadBytes(FinnishStopWords)
-	return rv, err
-}
-
-func init() {
-	registry.RegisterTokenMap(StopName, TokenMapConstructor)
-}

+ 0 - 28
analysis/language/fr/stemmer_fr.go

@@ -1,28 +0,0 @@
-//  Copyright (c) 2014 Couchbase, Inc.
-//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
-//  except in compliance with the License. You may obtain a copy of the License at
-//    http://www.apache.org/licenses/LICENSE-2.0
-//  Unless required by applicable law or agreed to in writing, software distributed under the
-//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
-//  either express or implied. See the License for the specific language governing permissions
-//  and limitations under the License.
-
-// +build libstemmer full
-
-package fr
-
-import (
-	"github.com/blevesearch/bleve/analysis"
-	"github.com/blevesearch/bleve/analysis/token_filters/stemmer_filter"
-	"github.com/blevesearch/bleve/registry"
-)
-
-const StemmerName = "stemmer_fr"
-
-func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
-	return stemmer_filter.NewStemmerFilter("fr")
-}
-
-func init() {
-	registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
-}

+ 0 - 55
analysis/language/hu/analyzer_hu.go

@@ -1,55 +0,0 @@
-//  Copyright (c) 2014 Couchbase, Inc.
-//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
-//  except in compliance with the License. You may obtain a copy of the License at
-//    http://www.apache.org/licenses/LICENSE-2.0
-//  Unless required by applicable law or agreed to in writing, software distributed under the
-//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
-//  either express or implied. See the License for the specific language governing permissions
-//  and limitations under the License.
-
-// +build libstemmer full
-// +build icu full
-
-package hu
-
-import (
-	"github.com/blevesearch/bleve/analysis"
-	"github.com/blevesearch/bleve/registry"
-
-	"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
-	"github.com/blevesearch/bleve/analysis/tokenizers/icu"
-)
-
-const AnalyzerName = "hu"
-
-func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
-	icuTokenizer, err := cache.TokenizerNamed(icu.Name)
-	if err != nil {
-		return nil, err
-	}
-	toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
-	if err != nil {
-		return nil, err
-	}
-	stopHuFilter, err := cache.TokenFilterNamed(StopName)
-	if err != nil {
-		return nil, err
-	}
-	stemmerHuFilter, err := cache.TokenFilterNamed(StemmerName)
-	if err != nil {
-		return nil, err
-	}
-	rv := analysis.Analyzer{
-		Tokenizer: icuTokenizer,
-		TokenFilters: []analysis.TokenFilter{
-			toLowerFilter,
-			stopHuFilter,
-			stemmerHuFilter,
-		},
-	}
-	return &rv, nil
-}
-
-func init() {
-	registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
-}

+ 0 - 68
analysis/language/hu/analyzer_hu_test.go

@@ -1,68 +0,0 @@
-//  Copyright (c) 2014 Couchbase, Inc.
-//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
-//  except in compliance with the License. You may obtain a copy of the License at
-//    http://www.apache.org/licenses/LICENSE-2.0
-//  Unless required by applicable law or agreed to in writing, software distributed under the
-//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
-//  either express or implied. See the License for the specific language governing permissions
-//  and limitations under the License.
-
-// +build libstemmer full
-// +build icu full
-
-package hu
-
-import (
-	"reflect"
-	"testing"
-
-	"github.com/blevesearch/bleve/analysis"
-	"github.com/blevesearch/bleve/registry"
-)
-
-func TestHungarianAnalyzer(t *testing.T) {
-	tests := []struct {
-		input  []byte
-		output analysis.TokenStream
-	}{
-		// stemming
-		{
-			input: []byte("babakocsi"),
-			output: analysis.TokenStream{
-				&analysis.Token{
-					Term: []byte("babakocs"),
-				},
-			},
-		},
-		{
-			input: []byte("babakocsijáért"),
-			output: analysis.TokenStream{
-				&analysis.Token{
-					Term: []byte("babakocs"),
-				},
-			},
-		},
-		// stop word
-		{
-			input:  []byte("által"),
-			output: analysis.TokenStream{},
-		},
-	}
-
-	cache := registry.NewCache()
-	analyzer, err := cache.AnalyzerNamed(AnalyzerName)
-	if err != nil {
-		t.Fatal(err)
-	}
-	for _, test := range tests {
-		actual := analyzer.Analyze(test.input)
-		if len(actual) != len(test.output) {
-			t.Fatalf("expected length: %d, got %d", len(test.output), len(actual))
-		}
-		for i, tok := range actual {
-			if !reflect.DeepEqual(tok.Term, test.output[i].Term) {
-				t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term)
-			}
-		}
-	}
-}

+ 0 - 28
analysis/language/hu/stemmer_hu.go

@@ -1,28 +0,0 @@
-//  Copyright (c) 2014 Couchbase, Inc.
-//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
-//  except in compliance with the License. You may obtain a copy of the License at
-//    http://www.apache.org/licenses/LICENSE-2.0
-//  Unless required by applicable law or agreed to in writing, software distributed under the
-//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
-//  either express or implied. See the License for the specific language governing permissions
-//  and limitations under the License.
-
-// +build libstemmer full
-
-package hu
-
-import (
-	"github.com/blevesearch/bleve/analysis"
-	"github.com/blevesearch/bleve/analysis/token_filters/stemmer_filter"
-	"github.com/blevesearch/bleve/registry"
-)
-
-const StemmerName = "stemmer_hu"
-
-func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
-	return stemmer_filter.NewStemmerFilter("hu")
-}
-
-func init() {
-	registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
-}

+ 0 - 28
analysis/language/hu/stop_filter_hu.go

@@ -1,28 +0,0 @@
-//  Copyright (c) 2014 Couchbase, Inc.
-//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
-//  except in compliance with the License. You may obtain a copy of the License at
-//    http://www.apache.org/licenses/LICENSE-2.0
-//  Unless required by applicable law or agreed to in writing, software distributed under the
-//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
-//  either express or implied. See the License for the specific language governing permissions
-//  and limitations under the License.
-
-package hu
-
-import (
-	"github.com/blevesearch/bleve/analysis"
-	"github.com/blevesearch/bleve/analysis/token_filters/stop_tokens_filter"
-	"github.com/blevesearch/bleve/registry"
-)
-
-func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
-	tokenMap, err := cache.TokenMapNamed(StopName)
-	if err != nil {
-		return nil, err
-	}
-	return stop_tokens_filter.NewStopTokensFilter(tokenMap), nil
-}
-
-func init() {
-	registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor)
-}

+ 0 - 235
analysis/language/hu/stop_words_hu.go

@@ -1,235 +0,0 @@
-package hu
-
-import (
-	"github.com/blevesearch/bleve/analysis"
-	"github.com/blevesearch/bleve/registry"
-)
-
-const StopName = "stop_hu"
-
-// this content was obtained from:
-// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/
-// ` was changed to ' to allow for literal string
-
-var HungarianStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/hungarian/stop.txt
- | This file is distributed under the BSD License.
- | See http://snowball.tartarus.org/license.php
- | Also see http://www.opensource.org/licenses/bsd-license.html
- |  - Encoding was converted to UTF-8.
- |  - This notice was added.
- |
- | NOTE: To use this file with StopFilterFactory, you must specify format="snowball"
- 
-| Hungarian stop word list
-| prepared by Anna Tordai
-
-a
-ahogy
-ahol
-aki
-akik
-akkor
-alatt
-által
-általában
-amely
-amelyek
-amelyekben
-amelyeket
-amelyet
-amelynek
-ami
-amit
-amolyan
-amíg
-amikor
-át
-abban
-ahhoz
-annak
-arra
-arról
-az
-azok
-azon
-azt
-azzal
-azért
-aztán
-azután
-azonban
-bár
-be
-belül
-benne
-cikk
-cikkek
-cikkeket
-csak
-de
-e
-eddig
-egész
-egy
-egyes
-egyetlen
-egyéb
-egyik
-egyre
-ekkor
-el
-elég
-ellen
-elő
-először
-előtt
-első
-én
-éppen
-ebben
-ehhez
-emilyen
-ennek
-erre
-ez
-ezt
-ezek
-ezen
-ezzel
-ezért
-és
-fel
-felé
-hanem
-hiszen
-hogy
-hogyan
-igen
-így
-illetve
-ill.
-ill
-ilyen
-ilyenkor
-ison
-ismét
-itt
-jó
-jól
-jobban
-kell
-kellett
-keresztül
-keressünk
-ki
-kívül
-között
-közül
-legalább
-lehet
-lehetett
-legyen
-lenne
-lenni
-lesz
-lett
-maga
-magát
-majd
-majd
-már
-más
-másik
-meg
-még
-mellett
-mert
-mely
-melyek
-mi
-mit
-míg
-miért
-milyen
-mikor
-minden
-mindent
-mindenki
-mindig
-mint
-mintha
-mivel
-most
-nagy
-nagyobb
-nagyon
-ne
-néha
-nekem
-neki
-nem
-néhány
-nélkül
-nincs
-olyan
-ott
-össze
-ők
-őket
-pedig
-persze
-rá
-s
-saját
-sem
-semmi
-sok
-sokat
-sokkal
-számára
-szemben
-szerint
-szinte
-talán
-tehát
-teljes
-tovább
-továbbá
-több
-úgy
-ugyanis
-új
-újabb
-újra
-után
-utána
-utolsó
-vagy
-vagyis
-valaki
-valami
-valamint
-való
-vagyok
-van
-vannak
-volt
-voltam
-voltak
-voltunk
-vissza
-vele
-viszont
-volna
-`)
-
-func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) {
-	rv := analysis.NewTokenMap()
-	err := rv.LoadBytes(HungarianStopWords)
-	return rv, err
-}
-
-func init() {
-	registry.RegisterTokenMap(StopName, TokenMapConstructor)
-}

+ 0 - 28
analysis/language/it/stemmer_it.go

@@ -1,28 +0,0 @@
-//  Copyright (c) 2014 Couchbase, Inc.
-//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
-//  except in compliance with the License. You may obtain a copy of the License at
-//    http://www.apache.org/licenses/LICENSE-2.0
-//  Unless required by applicable law or agreed to in writing, software distributed under the
-//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
-//  either express or implied. See the License for the specific language governing permissions
-//  and limitations under the License.
-
-// +build libstemmer full
-
-package it
-
-import (
-	"github.com/blevesearch/bleve/analysis"
-	"github.com/blevesearch/bleve/analysis/token_filters/stemmer_filter"
-	"github.com/blevesearch/bleve/registry"
-)
-
-const StemmerName = "stemmer_it"
-
-func StemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) {
-	return stemmer_filter.NewStemmerFilter("it")
-}
-
-func init() {
-	registry.RegisterTokenFilter(StemmerName, StemmerFilterConstructor)
-}

+ 0 - 2
analysis/language/ja/analyzer_ja.go

@@ -7,8 +7,6 @@
 //  either express or implied. See the License for the specific language governing permissions
 //  and limitations under the License.
 
-// +build kagome full
-
 package ja
 
 import (

+ 0 - 2
analysis/language/ja/analyzer_ja_test.go

@@ -7,8 +7,6 @@
 //  either express or implied. See the License for the specific language governing permissions
 //  and limitations under the License.
 
-// +build kagome full
-
 package ja
 
 import (

+ 0 - 2
analysis/language/ja/ja_morph_kagome.go

@@ -7,8 +7,6 @@
 //  either express or implied. See the License for the specific language governing permissions
 //  and limitations under the License.
 
-// +build kagome full
-
 package ja
 
 import (

+ 0 - 2
analysis/language/ja/ja_morph_kagome_test.go

@@ -7,8 +7,6 @@
 //  either express or implied. See the License for the specific language governing permissions
 //  and limitations under the License.
 
-// +build kagome full
-
 package ja
 
 import (

+ 0 - 55
analysis/language/nl/analyzer_nl.go

@@ -1,55 +0,0 @@
-//  Copyright (c) 2014 Couchbase, Inc.
-//  Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
-//  except in compliance with the License. You may obtain a copy of the License at
-//    http://www.apache.org/licenses/LICENSE-2.0
-//  Unless required by applicable law or agreed to in writing, software distributed under the
-//  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
-//  either express or implied. See the License for the specific language governing permissions
-//  and limitations under the License.
-
-// +build libstemmer full
-// +build icu full
-
-package nl
-
-import (
-	"github.com/blevesearch/bleve/analysis"
-	"github.com/blevesearch/bleve/registry"
-
-	"github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter"
-	"github.com/blevesearch/bleve/analysis/tokenizers/icu"
-)
-
-const AnalyzerName = "nl"
-
-func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) {
-	icuTokenizer, err := cache.TokenizerNamed(icu.Name)
-	if err != nil {
-		return nil, err
-	}
-	toLowerFilter, err := cache.TokenFilterNamed(lower_case_filter.Name)
-	if err != nil {
-		return nil, err
-	}
-	stopNlFilter, err := cache.TokenFilterNamed(StopName)
-	if err != nil {
-		return nil, err
-	}
-	stemmerNlFilter, err := cache.TokenFilterNamed(StemmerName)
-	if err != nil {
-		return nil, err
-	}
-	rv := analysis.Analyzer{
-		Tokenizer: icuTokenizer,
-		TokenFilters: []analysis.TokenFilter{
-			toLowerFilter,
-			stopNlFilter,
-			stemmerNlFilter,
-		},
-	}
-	return &rv, nil
-}
-
-func init() {
-	registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor)
-}