快速匹配字符串

假设内存有一个大字符串集,里面含有约1000万个字符串,如何快速知道该字符串集是否含有某个指定的测试字符串 ? (假设内存能放下这么多字符串)

方法一: hashCode法

方法二: Trie树 http://zh.wikipedia.org/wiki/Trie

HashCode法



  
    
      
        
          01
        
        
        
          def sampleHashMap(strSet: Set[String]): Map[Int, Set[String]]={ 
        
      
    
  
  
  
    
      
        
          02
        
        
        
             strSet.groupBy{ 
        
      
    
  
  
  
    
      
        
          03
        
        
        
                s => s.hashCode 
        
      
    
  
  
  
    
      
        
          04
        
        
        
             }   
        
      
    
  
  
  
    
      
        
          05
        
        
        
          } 
        
      
    
  
  
  
    
      
        
          06
        
        
        
           
        
      
    
  
  
  
    
      
        
          07
        
        
        
          def contain(str: String, sampleMap: Map[Int, Set[String]]): Boolean = {  
        
      
    
  
  
  
    
      
        
          08
        
        
        
             sampleMap.getOrElse(str.hashCode, Set[String]()).exists(_ equals str) 
        
      
    
  
  
  
    
      
        
          09
        
        
        
          } 
        
      
    
  
  
  
    
      
        
          10
        
        
        
           
        
      
    
  
  
  
    
      
        
          11
        
        
        
          // 测试 
        
      
    
  
  
  
    
      
        
          12
        
        
        
          val sampleSet = Set("a","b","c","AA","Archer","Jack"); 
        
      
    
  
  
  
    
      
        
          13
        
        
        
          val sampleMap = sampleHashMap(sampleSet); 
        
      
    
  
  
  
    
      
        
          14
        
        
        
          println(contain("Archer", sampleMap));  // true  
        
      
    
  
  
  
    
      
        
          15
        
        
        
          println(contain("A", sampleMap));    //  false 
        
      
    
  
  
  
    
      
        
          16
        
        
        
          println(contain("a", sampleMap));   // true
        
      
    
  



 Trie树法



  
    
      
        
          01
        
        
        
          sealed abstract class  Trie { 
        
      
    
  
  
  
    
      
        
          02
        
        
        
            def contains(msg: String): Boolean = contains(msg, this) 
        
      
    
  
  
  
    
      
        
          03
        
        
        
               
        
      
    
  
  
  
    
      
        
          04
        
        
        
             @scala.annotation.tailrec 
        
      
    
  
  
  
    
      
        
          05
        
        
        
             private def contains(msg: String, trie: Trie): Boolean = (msg, trie) match {   
        
      
    
  
  
  
    
      
        
          06
        
        
        
                  case (null, _ ) | (_ , null ) => false
        
      
    
  
  
  
    
      
        
          07
        
        
        
                  case (StringCase(head, tail), TrieNode(edges)) => edges.get(head) match { 
        
      
    
  
  
  
    
      
        
          08
        
        
        
                      case None => false
        
      
    
  
  
  
    
      
        
          09
        
        
        
                      case Some(subTrie) => contains(tail,subTrie) 
        
      
    
  
  
  
    
      
        
          10
        
        
        
                  } 
        
      
    
  
  
  
    
      
        
          11
        
        
        
                  case (StringCase(_,_), TrieLeaf()) => false
        
      
    
  
  
  
    
      
        
          12
        
        
        
                  case _ => true
        
      
    
  
  
  
    
      
        
          13
        
        
        
              } 
        
      
    
  
  
  
    
      
        
          14
        
        
        
          } 
        
      
    
  
  
  
    
      
        
          15
        
        
        
           
        
      
    
  
  
  
    
      
        
          16
        
        
        
          case class TrieNode(edges: Map[Char, Trie]) extends Trie{ 
        
      
    
  
  
  
    
      
        
          17
        
        
        
             assert(edges.size > ) 
        
      
    
  
  
  
    
      
        
          18
        
        
        
          } 
        
      
    
  
  
  
    
      
        
          19
        
        
        
          case class TrieLeaf extends Trie 
        
      
    
  
  
  
    
      
        
          20
        
        
        
           
        
      
    
  
  
  
    
      
        
          21
        
        
        
          object StringCase{ 
        
      
    
  
  
  
    
      
        
          22
        
        
        
              def unapply(s: String): Option[(Char, String)] ={  
        
      
    
  
  
  
    
      
        
          23
        
        
        
                if (s == null || s == "" )  None 
        
      
    
  
  
  
    
      
        
          24
        
        
        
                else Some(s.head, s.tail) 
        
      
    
  
  
  
    
      
        
          25
        
        
        
              } 
        
      
    
  
  
  
    
      
        
          26
        
        
        
          } 
        
      
    
  
  
  
    
      
        
          27
        
        
        
           
        
      
    
  
  
  
    
      
        
          28
        
        
        
          object Trie{ 
        
      
    
  
  
  
    
      
        
          29
        
        
        
               
        
      
    
  
  
  
    
      
        
          30
        
        
        
             def apply(strs: String*): Trie ={ 
        
      
    
  
  
  
    
      
        
          31
        
        
        
                apply(strs.toSet); 
        
      
    
  
  
  
    
      
        
          32
        
        
        
             } 
        
      
    
  
  
  
    
      
        
          33
        
        
        
              
        
      
    
  
  
  
    
      
        
          34
        
        
        
             def apply(strSet: Set[String]) : Trie = { 
        
      
    
  
  
  
    
      
        
          35
        
        
        
                if (strSet.size == || strSet == Set("")) TrieLeaf() 
        
      
    
  
  
  
    
      
        
          36
        
        
        
                else { 
        
      
    
  
  
  
    
      
        
          37
        
        
        
                   val char2TrieSet = strSet.filter(_.length > ).groupBy(_.head).mapValues{ 
        
      
    
  
  
  
    
      
        
          38
        
        
        
                      strs => strs.map(_.tail) 
        
      
    
  
  
  
    
      
        
          39
        
        
        
                   }.mapValues{ 
        
      
    
  
  
  
    
      
        
          40
        
        
        
                      strs => apply(strs) 
        
      
    
  
  
  
    
      
        
          41
        
        
        
                   } 
        
      
    
  
  
  
    
      
        
          42
        
        
        
                   TrieNode(char2TrieSet); 
        
      
    
  
  
  
    
      
        
          43
        
        
        
                }   
        
      
    
  
  
  
    
      
        
          44
        
        
        
             } 
        
      
    
  
  
  
    
      
        
          45
        
        
        
               
        
      
    
  
  
  
    
      
        
          46
        
        
        
          } 
        
      
    
  
  
  
    
      
        
          47
        
        
        
           
        
      
    
  
  
  
    
      
        
          48
        
        
        
          val trie= Trie("abc","ac","b") 
        
      
    
  
  
  
    
      
        
          49
        
        
        
          println(trie) 
        
      
    
  
  
  
    
      
        
          50
        
        
        
          assert(trie.contains("") == true);     // true 
        
      
    
  
  
  
    
      
        
          51
        
        
        
          assert(trie.contains(null) == false);   //false  
        
      
    
  
  
  
    
      
        
          52
        
        
        
          assert(trie.contains("a") == true);    // true 
        
      
    
  
  
  
    
      
        
          53
        
        
        
          assert(trie.contains("b") == true);    // true 
        
      
    
  
  
  
    
      
        
          54
        
        
        
          assert(trie.contains("ab") == true);   //true 
        
      
    
  
  
  
    
      
        
          55
        
        
        
          assert(trie.contains("ac") == true);   //true 
        
      
    
  
  
  
    
      
        
          56
        
        
        
          assert(trie.contains("abc") == true);   //true 
        
      
    
  
  
  
    
      
        
          57
        
        
        
          assert(trie.contains("acb") == false);   // false 
        
      
    
  
  
  
    
      
        
          58
        
        
        
          assert(trie.contains("abcd") == false);   //false 
        
      
    
  
  
  
    
      
        
          59
        
        
        
          assert(trie.contains("bc") == false);   //false