html_char_filter.go 1.2 KB

123456789101112131415161718192021222324252627282930
  1. // Copyright (c) 2014 Couchbase, Inc.
  2. // Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file
  3. // except in compliance with the License. You may obtain a copy of the License at
  4. // http://www.apache.org/licenses/LICENSE-2.0
  5. // Unless required by applicable law or agreed to in writing, software distributed under the
  6. // License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND,
  7. // either express or implied. See the License for the specific language governing permissions
  8. // and limitations under the License.
  9. package html_char_filter
  10. import (
  11. "regexp"
  12. "github.com/couchbaselabs/bleve/analysis"
  13. "github.com/couchbaselabs/bleve/analysis/char_filters/regexp_char_filter"
  14. "github.com/couchbaselabs/bleve/registry"
  15. )
  16. const Name = "html"
  17. var htmlCharFilterRegexp = regexp.MustCompile(`</?[!\w]+((\s+\w+(\s*=\s*(?:".*?"|'.*?'|[^'">\s]+))?)+\s*|\s*)/?>`)
  18. func CharFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.CharFilter, error) {
  19. replaceBytes := []byte(" ")
  20. return regexp_char_filter.NewRegexpCharFilter(htmlCharFilterRegexp, replaceBytes), nil
  21. }
  22. func init() {
  23. registry.RegisterCharFilter(Name, CharFilterConstructor)
  24. }