Trie树的理解

性质

其基本性质可以归纳为:

  1. 跟结点不包括字符,除跟结点以外,每个结点只包含一个字符;
  2. 从跟结点到某一个结点,路径上经过的字符连接起来,为该结点对应的字符串;
  3. 每个结点的所有子结点包含的字符串不相同;

其中结点对儿子的指向,有三种方法:

  1. 对每个结点开字母集大小的数组,对应的下标是儿子所表示的字母,内容则是这个儿子对应在大数组上的位置,即标号;
  2. 对每个结点连一个链表,按一定顺序记录每个儿子;
  3. 使用树的左儿子右兄弟表示法

示意图

Trie树 理解-LMLPHP


代码:

const int MAX_CHARS = 26;

class TrieNode {
public:
TrieNode(string s) : isWord(false), word(s) {
memset(children, 0, sizeof(children));
} string word;
bool isWord; // 标记该结点是否构成单词
TrieNode* children[MAX_CHARS]; // 子树
}; class TrieTree {
public:
TrieTree():root(new TrieNode("")) {}
~TrieTree() { freeTree(root); } TrieNode * getRoot() {
return root;
} void addWord(string & s) {
TrieNode * node = root;
string t;
for (int i = 0; i < s.size(); ++i) {
t += s[i];
if (node->children[s[i] - 'a'] == NULL) {
node->children[s[i] - 'a'] = new TrieNode(t);
}
node = node->children[s[i] - 'a'];
}
node->isWord = true;
} private:
TrieNode * root;
void freeTree(TrieNode* node) {
for (int i = 0; i < MAX_CHARS; ++i) {
if (node->children[i] != NULL) {
freeTree(node->children[i]);
}
}
delete node;
}
};

例题

以例题 Word Search ||,从字符数组中连接字符并查找有无符合字符串数组的字符串;

贴上代码:

/// 使用Trie树进行
class TrieNode {
public:
TrieNode(string s) : isWord(false), word(s) {
memset(children, 0, sizeof(children));
} string word;
bool isWord; // 是否到达终点
TrieNode* children[MAX_CHARS];
}; class TrieTree {
public:
TrieTree():root(new TrieNode("")) {}
~TrieTree() { freeTree(root); } TrieNode * getRoot() {
return root;
} void addWord(string & s) {
TrieNode * node = root;
string t;
for (int i = 0; i < s.size(); ++i) {
t += s[i];
if (node->children[s[i] - 'a'] == NULL) {
node->children[s[i] - 'a'] = new TrieNode(t);
}
node = node->children[s[i] - 'a'];
}
node->isWord = true;
} private:
TrieNode * root;
void freeTree(TrieNode* node) {
for (int i = 0; i < MAX_CHARS; ++i) {
if (node->children[i] != NULL) {
freeTree(node->children[i]);
}
}
delete node;
}
}; void helper2(vector<vector<char>> & board, TrieNode* root, vector<string> & result, int i, int j) {
if (i < 0 || j < 0 || i >= board.size() || j >= board[0].size() || board[i][j] == '*')
return;
char ch = board[i][j];
root = root->children[ch - 'a'];
if (root == NULL) return;
if (root->isWord) {
result.push_back(root->word);
root->isWord = false;
}
board[i][j] = '*';
helper2(board, root, result, i - 1, j);
helper2(board, root, result, i + 1, j);
helper2(board, root, result, i, j - 1);
helper2(board, root, result, i, j + 1);
board[i][j] = ch;
}
vector<string> findWords3(vector<vector<char>>& board, vector<string>& words) {
TrieTree t;
for (int i = 0; i < words.size(); ++i) {
t.addWord(words[i]);
} vector<string> result;
for (int i = 0; i < board.size(); ++i) {
for (int j = 0; j < board[i].size(); ++j) {
helper2(board, t.getRoot(), result, i, j);
}
} return result;
} bool test() {
vector<vector<char>> board1 = {
{'o','a','a','n'},
{'e','t','a','e'},
{'i','h','k','r'},
{'i','f','l','v'}
};
vector<string> words1 = {"oath","pea","eat","rain"}; vector<vector<char>> board2 = {
{"a", "a"}
};
vector<string> words2 = {"aaa"}; vector<string> result = findWords3(board1, words1); cout << result.size() << endl; return true;
}

分析

在trie树中查找一个关键字的时间和树中包含的结点数目无关,而是取决于组成关键字的字符数。而二叉查找树的查找时间的树中的结点数目有关,其时间复杂度为O(log2n)。

如果要查找的关键字可以分解成字母序列并且不是很长,利用trie树查找速度优于二叉查找 树。比如若关键字长度最大是5,则利用trie树,利用5次比较可以从26^5=11881376个可能的关键字中检索出指定的关键字。而利用二叉查找树至少要进行Trie树 理解-LMLPHP次比较。

应用

  1. 字符串的检索,词频统计,搜索引擎的热门查询;

    像是一些大数据的问题,像是:
  1. 字符串的最长公共前缀;

    举例:
  1. 排序;

    trie树是一颗多叉树,只需要先序遍历整棵树,输出相应的字符串便是字典序的结果。

  2. 作为其他数据结构和算法的辅助结构;

    像是后缀树,AC自动机等;

这些问题,我打算在后面专门来做总结;

高级实现

双数组实现

05-11 19:40