快速匹配字符串
Contents
快速匹配字符串
假设内存有一个大字符串集,里面含有约1000万个字符串,如何快速知道该字符串集是否含有某个指定的测试字符串 ? (假设内存能放下这么多字符串)
方法一: hashCode法
方法二: Trie树 http://zh.wikipedia.org/wiki/Trie
HashCode法
01
def sampleHashMap(strSet: Set[String]): Map[Int, Set[String]]={
02
strSet.groupBy{
03
s => s.hashCode
04
}
05
}
06
07
def contain(str: String, sampleMap: Map[Int, Set[String]]): Boolean = {
08
sampleMap.getOrElse(str.hashCode, Set[String]()).exists(_ equals str)
09
}
10
11
// 测试
12
val sampleSet = Set("a","b","c","AA","Archer","Jack");
13
val sampleMap = sampleHashMap(sampleSet);
14
println(contain("Archer", sampleMap)); // true
15
println(contain("A", sampleMap)); // false
16
println(contain("a", sampleMap)); // true
Trie树法
01
sealed abstract class Trie {
02
def contains(msg: String): Boolean = contains(msg, this)
03
04
@scala.annotation.tailrec
05
private def contains(msg: String, trie: Trie): Boolean = (msg, trie) match {
06
case (null, _ ) | (_ , null ) => false
07
case (StringCase(head, tail), TrieNode(edges)) => edges.get(head) match {
08
case None => false
09
case Some(subTrie) => contains(tail,subTrie)
10
}
11
case (StringCase(_,_), TrieLeaf()) => false
12
case _ => true
13
}
14
}
15
16
case class TrieNode(edges: Map[Char, Trie]) extends Trie{
17
assert(edges.size > )
18
}
19
case class TrieLeaf extends Trie
20
21
object StringCase{
22
def unapply(s: String): Option[(Char, String)] ={
23
if (s == null || s == "" ) None
24
else Some(s.head, s.tail)
25
}
26
}
27
28
object Trie{
29
30
def apply(strs: String*): Trie ={
31
apply(strs.toSet);
32
}
33
34
def apply(strSet: Set[String]) : Trie = {
35
if (strSet.size == || strSet == Set("")) TrieLeaf()
36
else {
37
val char2TrieSet = strSet.filter(_.length > ).groupBy(_.head).mapValues{
38
strs => strs.map(_.tail)
39
}.mapValues{
40
strs => apply(strs)
41
}
42
TrieNode(char2TrieSet);
43
}
44
}
45
46
}
47
48
val trie= Trie("abc","ac","b")
49
println(trie)
50
assert(trie.contains("") == true); // true
51
assert(trie.contains(null) == false); //false
52
assert(trie.contains("a") == true); // true
53
assert(trie.contains("b") == true); // true
54
assert(trie.contains("ab") == true); //true
55
assert(trie.contains("ac") == true); //true
56
assert(trie.contains("abc") == true); //true
57
assert(trie.contains("acb") == false); // false
58
assert(trie.contains("abcd") == false); //false
59
assert(trie.contains("bc") == false); //false
Author lcf
LastMod 2012-11-26