238 lines
6.7 KiB
Go
238 lines
6.7 KiB
Go
|
// Copyright 2015 The Go Authors. All rights reserved.
|
|||
|
// Use of this source code is governed by a BSD-style
|
|||
|
// license that can be found in the LICENSE file.
|
|||
|
|
|||
|
//go:generate go run ../collate/maketables.go -cldr=23 -unicode=6.2.0 -types=search,searchjl -package=search
|
|||
|
|
|||
|
// Package search provides language-specific search and string matching.
|
|||
|
//
|
|||
|
// Natural language matching can be intricate. For example, Danish will insist
|
|||
|
// "Århus" and "Aarhus" are the same name and Turkish will match I to ı (note
|
|||
|
// the lack of a dot) in a case-insensitive match. This package handles such
|
|||
|
// language-specific details.
|
|||
|
//
|
|||
|
// Text passed to any of the calls in this package does not need to be
|
|||
|
// normalized.
|
|||
|
package search // import "golang.org/x/text/search"
|
|||
|
|
|||
|
import (
|
|||
|
"strings"
|
|||
|
|
|||
|
"golang.org/x/text/internal/colltab"
|
|||
|
"golang.org/x/text/language"
|
|||
|
)
|
|||
|
|
|||
|
// An Option configures a Matcher.
|
|||
|
type Option func(*Matcher)
|
|||
|
|
|||
|
var (
|
|||
|
// WholeWord restricts matches to complete words. The default is to match at
|
|||
|
// the character level.
|
|||
|
WholeWord Option = nil
|
|||
|
|
|||
|
// Exact requires that two strings are their exact equivalent. For example
|
|||
|
// å would not match aa in Danish. It overrides any of the ignore options.
|
|||
|
Exact Option = nil
|
|||
|
|
|||
|
// Loose causes case, diacritics and width to be ignored.
|
|||
|
Loose Option = loose
|
|||
|
|
|||
|
// IgnoreCase enables case-insensitive search.
|
|||
|
IgnoreCase Option = ignoreCase
|
|||
|
|
|||
|
// IgnoreDiacritics causes diacritics to be ignored ("ö" == "o").
|
|||
|
IgnoreDiacritics Option = ignoreDiacritics
|
|||
|
|
|||
|
// IgnoreWidth equates narrow with wide variants.
|
|||
|
IgnoreWidth Option = ignoreWidth
|
|||
|
)
|
|||
|
|
|||
|
// ignoreDiacritics is the function behind the IgnoreDiacritics option: it
// switches on the Matcher's ignoreDiacritics flag.
func ignoreDiacritics(m *Matcher) {
	m.ignoreDiacritics = true
}
|
|||
|
// ignoreCase is the function behind the IgnoreCase option: it switches on the
// Matcher's ignoreCase flag.
func ignoreCase(m *Matcher) {
	m.ignoreCase = true
}
|
|||
|
// ignoreWidth is the function behind the IgnoreWidth option: it switches on
// the Matcher's ignoreWidth flag.
func ignoreWidth(m *Matcher) {
	m.ignoreWidth = true
}
|
|||
|
func loose(m *Matcher) {
|
|||
|
ignoreDiacritics(m)
|
|||
|
ignoreCase(m)
|
|||
|
ignoreWidth(m)
|
|||
|
}
|
|||
|
|
|||
|
var (
|
|||
|
// Supported lists the languages for which search differs from its parent.
|
|||
|
Supported language.Coverage
|
|||
|
|
|||
|
tags []language.Tag
|
|||
|
)
|
|||
|
|
|||
|
func init() {
|
|||
|
ids := strings.Split(availableLocales, ",")
|
|||
|
tags = make([]language.Tag, len(ids))
|
|||
|
for i, s := range ids {
|
|||
|
tags[i] = language.Raw.MustParse(s)
|
|||
|
}
|
|||
|
Supported = language.NewCoverage(tags)
|
|||
|
}
|
|||
|
|
|||
|
// New returns a new Matcher for the given language and options.
|
|||
|
func New(t language.Tag, opts ...Option) *Matcher {
|
|||
|
m := &Matcher{
|
|||
|
w: getTable(locales[colltab.MatchLang(t, tags)]),
|
|||
|
}
|
|||
|
for _, f := range opts {
|
|||
|
f(m)
|
|||
|
}
|
|||
|
return m
|
|||
|
}
|
|||
|
|
|||
|
// A Matcher implements language-specific string matching.
|
|||
|
type Matcher struct {
|
|||
|
w colltab.Weighter
|
|||
|
ignoreCase bool
|
|||
|
ignoreWidth bool
|
|||
|
ignoreDiacritics bool
|
|||
|
}
|
|||
|
|
|||
|
// An IndexOption specifies how the Index methods of Pattern or Matcher should
// match the input. Values are bit flags; the Index methods OR together all
// options they receive before deciding which search to run.
type IndexOption byte
|
|||
|
|
|||
|
const (
|
|||
|
// Anchor restricts the search to the start (or end for Backwards) of the
|
|||
|
// text.
|
|||
|
Anchor IndexOption = 1 << iota
|
|||
|
|
|||
|
// Backwards starts the search from the end of the text.
|
|||
|
Backwards
|
|||
|
|
|||
|
anchorBackwards = Anchor | Backwards
|
|||
|
)
|
|||
|
|
|||
|
// Index reports the start and end position of the first occurrence of pat in b
|
|||
|
// or -1, -1 if pat is not present.
|
|||
|
func (m *Matcher) Index(b, pat []byte, opts ...IndexOption) (start, end int) {
|
|||
|
// TODO: implement optimized version that does not use a pattern.
|
|||
|
return m.Compile(pat).Index(b, opts...)
|
|||
|
}
|
|||
|
|
|||
|
// IndexString reports the start and end position of the first occurrence of pat
|
|||
|
// in s or -1, -1 if pat is not present.
|
|||
|
func (m *Matcher) IndexString(s, pat string, opts ...IndexOption) (start, end int) {
|
|||
|
// TODO: implement optimized version that does not use a pattern.
|
|||
|
return m.CompileString(pat).IndexString(s, opts...)
|
|||
|
}
|
|||
|
|
|||
|
// Equal reports whether a and b are equivalent.
|
|||
|
func (m *Matcher) Equal(a, b []byte) bool {
|
|||
|
_, end := m.Index(a, b, Anchor)
|
|||
|
return end == len(a)
|
|||
|
}
|
|||
|
|
|||
|
// EqualString reports whether a and b are equivalent.
|
|||
|
func (m *Matcher) EqualString(a, b string) bool {
|
|||
|
_, end := m.IndexString(a, b, Anchor)
|
|||
|
return end == len(a)
|
|||
|
}
|
|||
|
|
|||
|
// Compile compiles and returns a pattern that can be used for faster searching.
|
|||
|
func (m *Matcher) Compile(b []byte) *Pattern {
|
|||
|
p := &Pattern{m: m}
|
|||
|
iter := colltab.Iter{Weighter: m.w}
|
|||
|
for iter.SetInput(b); iter.Next(); {
|
|||
|
}
|
|||
|
p.ce = iter.Elems
|
|||
|
p.deleteEmptyElements()
|
|||
|
return p
|
|||
|
}
|
|||
|
|
|||
|
// CompileString compiles and returns a pattern that can be used for faster
|
|||
|
// searching.
|
|||
|
func (m *Matcher) CompileString(s string) *Pattern {
|
|||
|
p := &Pattern{m: m}
|
|||
|
iter := colltab.Iter{Weighter: m.w}
|
|||
|
for iter.SetInputString(s); iter.Next(); {
|
|||
|
}
|
|||
|
p.ce = iter.Elems
|
|||
|
p.deleteEmptyElements()
|
|||
|
return p
|
|||
|
}
|
|||
|
|
|||
|
// A Pattern is a compiled search string. It is safe for concurrent use.
|
|||
|
type Pattern struct {
|
|||
|
m *Matcher
|
|||
|
ce []colltab.Elem
|
|||
|
}
|
|||
|
|
|||
|
// Design note (TODO remove):
|
|||
|
// The cost of retrieving collation elements for each rune, which is used for
|
|||
|
// search as well, is not trivial. Also, algorithms like Boyer-Moore and
|
|||
|
// Sunday require some additional precomputing.
|
|||
|
|
|||
|
// Index reports the start and end position of the first occurrence of p in b
|
|||
|
// or -1, -1 if p is not present.
|
|||
|
func (p *Pattern) Index(b []byte, opts ...IndexOption) (start, end int) {
|
|||
|
// Pick a large enough buffer such that we likely do not need to allocate
|
|||
|
// and small enough to not cause too much overhead initializing.
|
|||
|
var buf [8]colltab.Elem
|
|||
|
|
|||
|
it := &colltab.Iter{
|
|||
|
Weighter: p.m.w,
|
|||
|
Elems: buf[:0],
|
|||
|
}
|
|||
|
it.SetInput(b)
|
|||
|
|
|||
|
var optMask IndexOption
|
|||
|
for _, o := range opts {
|
|||
|
optMask |= o
|
|||
|
}
|
|||
|
|
|||
|
switch optMask {
|
|||
|
case 0:
|
|||
|
return p.forwardSearch(it)
|
|||
|
case Anchor:
|
|||
|
return p.anchoredForwardSearch(it)
|
|||
|
case Backwards, anchorBackwards:
|
|||
|
panic("TODO: implement")
|
|||
|
default:
|
|||
|
panic("unrecognized option")
|
|||
|
}
|
|||
|
}
|
|||
|
|
|||
|
// IndexString reports the start and end position of the first occurrence of p
|
|||
|
// in s or -1, -1 if p is not present.
|
|||
|
func (p *Pattern) IndexString(s string, opts ...IndexOption) (start, end int) {
|
|||
|
// Pick a large enough buffer such that we likely do not need to allocate
|
|||
|
// and small enough to not cause too much overhead initializing.
|
|||
|
var buf [8]colltab.Elem
|
|||
|
|
|||
|
it := &colltab.Iter{
|
|||
|
Weighter: p.m.w,
|
|||
|
Elems: buf[:0],
|
|||
|
}
|
|||
|
it.SetInputString(s)
|
|||
|
|
|||
|
var optMask IndexOption
|
|||
|
for _, o := range opts {
|
|||
|
optMask |= o
|
|||
|
}
|
|||
|
|
|||
|
switch optMask {
|
|||
|
case 0:
|
|||
|
return p.forwardSearch(it)
|
|||
|
case Anchor:
|
|||
|
return p.anchoredForwardSearch(it)
|
|||
|
case Backwards, anchorBackwards:
|
|||
|
panic("TODO: implement")
|
|||
|
default:
|
|||
|
panic("unrecognized option")
|
|||
|
}
|
|||
|
}
|
|||
|
|
|||
|
// TODO:
|
|||
|
// - Maybe IndexAll methods (probably not necessary).
|
|||
|
// - Some way to match patterns in a Reader (a bit tricky).
|
|||
|
// - Some fold transformer that folds text to comparable text, based on the
|
|||
|
// search options. This is a common technique, though very different from the
|
|||
|
// collation-based design of this package. It has a somewhat different use
|
|||
|
// case, so probably makes sense to support both. Should probably be in a
|
|||
|
// different package, though, as it uses completely different kinds of tables
|
|||
|
// (based on norm, cases, width and range tables.)
|