Initial commit with Aho-Corasick code.
[aho.git] / src / algorithm.cpp
1 #include <iostream>\r
2 #include <vector>\r
3 #include <string>\r
4 \r
5 #include "typedef.h"\r
6 #include "algorithm.h"\r
7 #include "trie.h"\r
8 \r
9 /**\r
10  * Adds patterns to trie.\r
11  */\r
12 void aho::algorithm::add_patterns_to_trie() {\r
13         uint max_size_of_patterns = 0;\r
14 \r
15         for(uint i = 0; i < this->patterns.size(); i++) {\r
16                 max_size_of_patterns = max_size_of_patterns < this->patterns[i].size() ? this->patterns[i].size() : max_size_of_patterns;\r
17         }\r
18 \r
19         for(uint letter_iterator = 0; letter_iterator < max_size_of_patterns; letter_iterator++) {\r
20                 for(uint pattern_iterator = 0; pattern_iterator < this->patterns.size(); pattern_iterator++) {\r
21                         tstring current_pattern = this->patterns[pattern_iterator];\r
22 \r
23                         if(letter_iterator < current_pattern.size()) {\r
24                                 this->trie->add_child(new aho::trie(current_pattern.substr(0, letter_iterator + 1)));\r
25                         }\r
26                 }\r
27         }\r
28 }\r
29 \r
30 void aho::algorithm::set_patterns_in_trie() {\r
31         //Iterate over all patterns\r
32         for(uint i = 0; i < this->patterns.size(); i++) {\r
33                 //Return to root node.\r
34                 class aho::trie * node = this->trie;\r
35                 tstring pattern = this->patterns[i];\r
36 \r
37                 //Search for pattern in trie\r
38                 for(uint j = 0; j < pattern.size(); j++) {\r
39                         node = node->get_child(pattern.substr(0, j + 1).c_str());\r
40                 }\r
41                 //Set found node as pattern\r
42                 node->is_pattern(true);\r
43         }\r
44 }\r
45 \r
46 void aho::algorithm::set_suffix_link() {\r
47         for(uint j = 0; j < this->list.size(); j++) {\r
48                 class aho::trie * node = this->list[j];\r
49 \r
50                 for(uint k = 1; k < node->get_value().size(); k++) {\r
51                         tstring suffix = (node->get_value().substr(k));\r
52 \r
53                         class aho::trie * found = NULL;\r
54                         if((found = this->trie->get_child(suffix, true)) != NULL) {\r
55                                 if(node->get_suffix(found->get_value()) == NULL) {\r
56                                         node->add_suffix(found);\r
57                                 }\r
58                                 break;\r
59                         }\r
60                                 \r
61                 }\r
62         }\r
63         return;\r
64 }\r
65 \r
66 void aho::algorithm::set_nodes_list() {\r
67         for(uint i = 0; i < this->patterns.size(); i++) {\r
68                 for(uint j = 1; j <= this->patterns[i].size(); j++) {\r
69                         class aho::trie * node = this->get_node(this->patterns[i].substr(0, j));\r
70                         //TODO: Pozbyæ siê powtarzania\r
71                         if(!this->is_on_nodes_list(node)) {\r
72                                 this->list.push_back(node);\r
73                         }\r
74                 }\r
75         }\r
76 }\r
77 \r
78 /**\r
79  * Checks if node exists on node list.\r
80  */\r
81 bool aho::algorithm::is_on_nodes_list(class aho::trie * node) {\r
82         for(uint i = 0; i < this->list.size(); i++) {\r
83                 if(this->list[i] == node) {\r
84                         return true;\r
85                 }\r
86         }\r
87         return false;\r
88 }\r
89 \r
90 /**\r
91  * Adds pattern to search in a subject text.\r
92  */\r
93 void aho::algorithm::add_search_pattern(tstring pattern) {\r
94         this->patterns.push_back(pattern);\r
95 \r
96         for(uint i = 0; i < pattern.size(); i++) {\r
97                 if(this->letters.find(pattern[i]) == tstring::npos) {\r
98                         this->letters.push_back(pattern[i]);\r
99                 }\r
100         }\r
101 }\r
102 \r
103 /**\r
104  * Create trie.\r
105  */\r
106 void aho::algorithm::generate_trie() {\r
107         this->add_patterns_to_trie();\r
108         this->set_patterns_in_trie();\r
109         this->set_nodes_list();\r
110         this->set_suffix_link();\r
111 }\r
112 \r
113 /**\r
114  * Search subject text for patterns.\r
115  */\r
116 void aho::algorithm::parse() {\r
117         uint length = this->subject.size();\r
118         class aho::trie * current_node = this->trie;\r
119 \r
120                 for(uint j = 0; j < length; j++) {\r
121                         tstring letter;\r
122                         letter = this->subject[j];\r
123 \r
124                         tstring value(current_node->get_value() + letter);\r
125 \r
126                         class aho::trie * node = current_node->get_child(value);\r
127                         class aho::trie * suffix_node = NULL;\r
128 \r
129                         if(node == NULL) {\r
130                                 //value = abc\r
131                                 //szukaj najd³u¿szego sufixu w node'zie\r
132 \r
133                                 for(uint k = 1; k < value.size(); k++) {\r
134 //                                      tstring suffix(current_node->get_value().substr(k));\r
135                                         if((suffix_node = current_node->get_suffix(value.substr(k))) != NULL) {\r
136                                                 node = suffix_node;\r
137                                                 break;\r
138                                         }\r
139                                         else if((suffix_node = current_node->get_suffix(current_node->get_value().substr(k))) != NULL) {\r
140                                                 node = suffix_node;\r
141                                                 j -= k;\r
142                                                 break;\r
143                                         }\r
144                                 }\r
145                         }\r
146                         if(node) {\r
147                                 if(this->is_pattern(node)) {\r
148 \r
149                                         this->save_result(node->get_value(), j+2-node->get_value().size());\r
150 \r
151 /*\r
152 W przypadku natrafienia na wêze³\r
153 oznaczony jako koniec s³owa, algorytm wypisuje je jako znalezione w tek\9ccie oraz sprawdza, czy czasem w tym\r
154 miejscu nie koñczy siê wiêcej wzorców przechodz¹c krawêdziami fail a¿ do korzenia sprawdzaj¹c, czy nie\r
155 przechodzi po wierzcho³ku bêd¹cym koñcem wzorca.\r
156 */\r
157                                         //Sprawd\9f suffixy je\9cli istniej¹\r
158                                         class aho::trie * other = node->get_suffix();\r
159                                         while(other != NULL) {\r
160                                                 if(this->is_pattern(other)) {\r
161                                                         this->save_result(other->get_value(), j+2-other->get_value().size());\r
162                                                 }\r
163                                                 if(other->get_child(other->get_value() + this->subject[j+1])) {\r
164                                                         break;\r
165                                                 }\r
166                                                 other = other->get_suffix();\r
167                                         }\r
168                                         if(!other) {\r
169                                                 other = this->trie;\r
170                                         }\r
171                                         //Zgubi³ w¹tek, bo natrafi³ na literê, która nie jest dzieckiem.\r
172                                         //Przejecha³ po suffixach...\r
173                                         //Pozosta³o sprawdziæ, czy na podstawie\r
174                                         if(!node->get_child(node->get_value() + this->subject[j+1])) {\r
175                                                 node = other;\r
176                                         }\r
177                                 }\r
178                                 current_node = node;\r
179                         }\r
180                         current_node = node ? node : this->trie;\r
181 \r
182                 }\r
183                 \r
184         return;\r
185 }