From 32058f00c48b61a9d794a6a29eb1b0fde5e95b07 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Rafa=C5=82=20D=C5=82ugo=C5=82=C4=99cki?= Date: Sat, 29 Jun 2013 18:38:27 +0200 Subject: [PATCH] Initial commit with Aho-Corasick code. --- src/algorithm.cpp | 185 ++++++++++++++++++++++++++++++++++++++++++++++ src/algorithm.h | 106 ++++++++++++++++++++++++++ src/main.cpp | 42 +++++++++++ src/trie.h | 171 ++++++++++++++++++++++++++++++++++++++++++ src/typedef.h | 129 ++++++++++++++++++++++++++++++++ 5 files changed, 633 insertions(+) create mode 100644 src/algorithm.cpp create mode 100644 src/algorithm.h create mode 100644 src/main.cpp create mode 100644 src/trie.h create mode 100644 src/typedef.h diff --git a/src/algorithm.cpp b/src/algorithm.cpp new file mode 100644 index 0000000..ffcc419 --- /dev/null +++ b/src/algorithm.cpp @@ -0,0 +1,185 @@ +#include +#include +#include + +#include "typedef.h" +#include "algorithm.h" +#include "trie.h" + +/** + * Adds patterns to trie. + */ +void aho::algorithm::add_patterns_to_trie() { + uint max_size_of_patterns = 0; + + for(uint i = 0; i < this->patterns.size(); i++) { + max_size_of_patterns = max_size_of_patterns < this->patterns[i].size() ? this->patterns[i].size() : max_size_of_patterns; + } + + for(uint letter_iterator = 0; letter_iterator < max_size_of_patterns; letter_iterator++) { + for(uint pattern_iterator = 0; pattern_iterator < this->patterns.size(); pattern_iterator++) { + tstring current_pattern = this->patterns[pattern_iterator]; + + if(letter_iterator < current_pattern.size()) { + this->trie->add_child(new aho::trie(current_pattern.substr(0, letter_iterator + 1))); + } + } + } +} + +void aho::algorithm::set_patterns_in_trie() { + //Iterate over all patterns + for(uint i = 0; i < this->patterns.size(); i++) { + //Return to root node. + class aho::trie * node = this->trie; + tstring pattern = this->patterns[i]; + + //Search for pattern in trie + for(uint j = 0; j < pattern.size(); j++) { + node = node->get_child(pattern.substr(0, j + 1).c_str()); + } + //Set found node as pattern + node->is_pattern(true); + } +} + +void aho::algorithm::set_suffix_link() { + for(uint j = 0; j < this->list.size(); j++) { + class aho::trie * node = this->list[j]; + + for(uint k = 1; k < node->get_value().size(); k++) { + tstring suffix = (node->get_value().substr(k)); + + class aho::trie * found = NULL; + if((found = this->trie->get_child(suffix, true)) != NULL) { + if(node->get_suffix(found->get_value()) == NULL) { + node->add_suffix(found); + } + break; + } + + } + } + return; +} + +void aho::algorithm::set_nodes_list() { + for(uint i = 0; i < this->patterns.size(); i++) { + for(uint j = 1; j <= this->patterns[i].size(); j++) { + class aho::trie * node = this->get_node(this->patterns[i].substr(0, j)); + //TODO: Pozbyæ siê powtarzania + if(!this->is_on_nodes_list(node)) { + this->list.push_back(node); + } + } + } +} + +/** + * Checks if node exists on node list. + */ +bool aho::algorithm::is_on_nodes_list(class aho::trie * node) { + for(uint i = 0; i < this->list.size(); i++) { + if(this->list[i] == node) { + return true; + } + } + return false; +} + +/** + * Adds pattern to search in a subject text. + */ +void aho::algorithm::add_search_pattern(tstring pattern) { + this->patterns.push_back(pattern); + + for(uint i = 0; i < pattern.size(); i++) { + if(this->letters.find(pattern[i]) == tstring::npos) { + this->letters.push_back(pattern[i]); + } + } +} + +/** + * Create trie. + */ +void aho::algorithm::generate_trie() { + this->add_patterns_to_trie(); + this->set_patterns_in_trie(); + this->set_nodes_list(); + this->set_suffix_link(); +} + +/** + * Search subject text for patterns. + */ +void aho::algorithm::parse() { + uint length = this->subject.size(); + class aho::trie * current_node = this->trie; + + for(uint j = 0; j < length; j++) { + tstring letter; + letter = this->subject[j]; + + tstring value(current_node->get_value() + letter); + + class aho::trie * node = current_node->get_child(value); + class aho::trie * suffix_node = NULL; + + if(node == NULL) { + //value = abc + //szukaj najd³u¿szego sufixu w node'zie + + for(uint k = 1; k < value.size(); k++) { +// tstring suffix(current_node->get_value().substr(k)); + if((suffix_node = current_node->get_suffix(value.substr(k))) != NULL) { + node = suffix_node; + break; + } + else if((suffix_node = current_node->get_suffix(current_node->get_value().substr(k))) != NULL) { + node = suffix_node; + j -= k; + break; + } + } + } + if(node) { + if(this->is_pattern(node)) { + + this->save_result(node->get_value(), j+2-node->get_value().size()); + +/* +W przypadku natrafienia na wêze³ +oznaczony jako koniec s³owa, algorytm wypisuje je jako znalezione w tekœcie oraz sprawdza, czy czasem w tym +miejscu nie koñczy siê wiêcej wzorców przechodz¹c krawêdziami fail a¿ do korzenia sprawdzaj¹c, czy nie +przechodzi po wierzcho³ku bêd¹cym koñcem wzorca. +*/ + //SprawdŸ suffixy jeœli istniej¹ + class aho::trie * other = node->get_suffix(); + while(other != NULL) { + if(this->is_pattern(other)) { + this->save_result(other->get_value(), j+2-other->get_value().size()); + } + if(other->get_child(other->get_value() + this->subject[j+1])) { + break; + } + other = other->get_suffix(); + } + if(!other) { + other = this->trie; + } + //Zgubi³ w¹tek, bo natrafi³ na literê, która nie jest dzieckiem. + //Przejecha³ po suffixach... + //Pozosta³o sprawdziæ, czy na podstawie + if(!node->get_child(node->get_value() + this->subject[j+1])) { + node = other; + } + } + current_node = node; + } + current_node = node ? node : this->trie; + + } + + return; +} \ No newline at end of file diff --git a/src/algorithm.h b/src/algorithm.h new file mode 100644 index 0000000..053303d --- /dev/null +++ b/src/algorithm.h @@ -0,0 +1,106 @@ +#ifndef AHO_ALGORITHM_H +#define AHO_ALGORITHM_H + +#include +#include +#include + +#include "typedef.h" +#include "trie.h" + +/* +At each step, the current node is extended by finding its daughter, +and if that doesn't exist, finding its suffix's daughter, +and if that doesn't work, finding its suffix's suffix's daughter, +finally ending in the root node if nothing's seen before. +*/ +namespace aho { + +// class result { +// private: +// protected: +// public: +// tstring rslt; +// int position; +// }; + +class algorithm { + private: + protected: + class aho::trie * trie; + + std::vector patterns; + tstring subject; + tstring letters; + + std::vector list; + + void add_patterns_to_trie(); + + /** + * Sets which nodes in trie, are the searched patterns. + */ + void set_patterns_in_trie(); + + /** + * Sets node list. + */ + void set_nodes_list(); + + /** + * Sets suffix links for each pattern in trie. + */ + void set_suffix_link(); + + /** + * Checks if node is a searched pattern. + */ + virtual bool is_pattern(class aho::trie * node) { + return node->is_pattern(); + } + + bool is_on_nodes_list(class aho::trie * node); + + /** + * Gets node by its value + */ + class aho::trie * get_node(tstring value) { + return this->trie->get_child(value, true); + } + + /** + * Saves search results + */ + virtual void save_result(tstring search, uint position) { + printf("%s: %d\n", search.c_str(), position); + } + public: + /** + * Algorithm constructor. Initializes variables. + */ + algorithm() : trie(new aho::trie(TXT(""))) { + } + + /** + * Algorithm destructor. + */ + ~algorithm() { + if(this->trie != NULL) { + delete this->trie; + } + } + void add_search_pattern(tstring pattern); + + /** + * Sets subject text in which patterns will be searched. + */ + void set_search_subject(tstring subject) { + this->subject = subject; + } + + void generate_trie(); + void parse(); + }; +}; + +#endif \ No newline at end of file diff --git a/src/main.cpp b/src/main.cpp new file mode 100644 index 0000000..cb9b3c1 --- /dev/null +++ b/src/main.cpp @@ -0,0 +1,42 @@ +#include + +#include "typedef.h" +#include "algorithm.h" +#include "trie.h" + +int main(int argc, int **argv) { + class aho::algorithm * algorithm = new aho::algorithm(); + + tfstream file("test.txt"); + uint number_of_words; + file >> number_of_words; + + for(uint i = 0; i < number_of_words; i++) { + tstring word; + file >> word; + algorithm->add_search_pattern(word); + tcout << word; + } + +// algorithm->add_search_pattern("Drogi"); +// algorithm->add_search_pattern("Marszalku"); +// algorithm->add_search_pattern("ze"); +// algorithm->add_search_pattern("i"); +// algorithm->add_search_pattern("Jak"); + tstring subject; + tstring tmp; + while(!file.eof()) { + file >> tmp; + subject += tmp + " "; + } + +// algorithm->set_search_subject("Drogi Marszalku, Wysoka Izbo. PKB roœnie. Z drugiej strony, rozszerzenie bazy o nowe rekordy poci¹ga za sob¹ proces wdrozenia i unowoczeœniania form dzia³alnoœci wymaga sprecyzowania i unowoczeœniania nowych propozycji. Ka¿dy ju¿ mówi³em jasne jest wa¿ne zadanie w kszta³towaniu kolejnych kroków w przygotowaniu i realizacji kolejnych kroków w okreœlaniu istniej¹cych kryteriów wymaga sprecyzowania i realizacji odpowiednich warunków administracyjno-finansowych. Jak ju¿ mówi³em jasne jest to, ze rozszerzenie bazy o tym, usprawnienie systemu powszechnego uczestnictwa. Przez ostatnie kilkanaœcie lat odkryliœmy ze rozszerzenie naszej dzia³alnoœci organizacyjnej rozszerza nam efekt obecnej sytuacji. Z drugiej strony, zakres i bogate doœwiadczenia pozwalaj¹ na uwadze, ze wdrozenie nowych, lepszych rozwi¹zañ wymaga niezwyk³ej precyzji w restrukturyzacji przedsiêbiorstwa. Jak ju¿ zapewne zd¹¿y³ zauwa¿yæ i¿ zakres i unowoczeœniania postaw uczestników wobec zadañ programowych jest to, ze aktualna struktura organizacji koliduje z powodu nowych propozycji. Nasza propozycja. Praktyka dnia codziennego dowodzi, ze konsultacja z dotychczasowymi zasadami kierunków postêpowego wychowania. Reasumuj¹c. nowy model dzia³alnoœci organizacyjnej ukazuje nam horyzonty kolejnych kroków w kszta³towaniu systemu szkolenia kadr koliduje z powodu obecnej sytuacji. W ten sposób dalszy rozwój ró¿nych form dzia³alnoœci organizacyjnej jest zauwazenie, ze inwestowanie."); + algorithm->set_search_subject(subject); + + algorithm->generate_trie(); + algorithm->parse(); + + delete algorithm; + + return 0; +} \ No newline at end of file diff --git a/src/trie.h b/src/trie.h new file mode 100644 index 0000000..6ccc965 --- /dev/null +++ b/src/trie.h @@ -0,0 +1,171 @@ +#ifndef AHO_TRIE_H +#define AHO_TRIE_H + +#include +#include + +#include "typedef.h" + +namespace aho { + class trie { + public: + typedef trie _Myt; + typedef tstring _string; + private: + _string value; + class aho::trie * parent; + + std::map<_string, class aho::trie *> suffixes; + std::vector children; + + bool pattern; + + /** + * Checks if a trie node should be nested in current node. + * @param node Trie node which will be compared with current node. + * @return returns true if node should be nested. False otherwise. + */ + bool should_be_nested(class aho::trie * node) { + return this->value.size() < node->get_value().size() ? true : false; + } + protected: + public: + /** + * Trie class constructor. Initializes variables with default values. + * @param value text value of trie's node. + */ + trie(_string _value) : parent(NULL), pattern(false), value(_value) { } + + /** + * Sets value of trie node. + * @param value Value which will be used to set the trie node value. + */ + void set_value(_string value) { + this->value = value; + } + + /** + * Sets parent node of this trie node. + * @param parent Parent node for current trie node. + */ + void set_parent(class aho::trie * node) { + this->parent = node; + } + + /** + * Adds node to the trie. + * @param node Trie node which will be added to the current node. + */ + void add_child(class aho::trie * node) { + //Je¿eli powinien byæ zagnie¿d¿ony (bo d³u¿szy) + if(this->should_be_nested(node)) { + uint length = this->value.size() + 1; + + //Je¿eli nie znalaz³ wpisu o danym fragmencie nazwy... + class aho::trie * tmp = this->get_child(node->get_value().substr(0, length)); + if(tmp == NULL) { + //Je¿eli to jest nadal fragment (nie jest to pe³na nazwa). + if(length < node->get_value().size()) { + tmp = new aho::trie(node->get_value().substr(0, length + 1)); + tmp->set_parent(this); + node->set_parent(tmp); + tmp->add_child(node); + this->children.push_back(tmp); + //this->children[tmp->get_value()] = tmp; + } + //WARNING! + //Je¿eli to ju¿ jest pe³na nazwa... + else { + node->set_parent(this); + this->children.push_back(node); + //this->children[node->get_value()] = node; + } + } + //Je¿eli znalaz³ wpis o takim fragmencie nazwy, to rekursywnie... + else { + if(length < node->get_value().size()) { + node->set_parent(tmp); + tmp->add_child(node); + } + } + } + //TODO: (?) + //Je¿eli ma nie byæ zagnie¿d¿ony...(Czyli, je¿eli powinien byæ dok³adnie na tym poziomie...) + else { + if(this->get_value() == node->get_value()) { + delete node; + } + } + } + + /** + * Sets suffix link of the current node with a given node. + * @param node Pointer to node, which will be linked as a suffix. + */ + void add_suffix(class aho::trie * node) { + this->suffixes[node->get_value()] = node; + } + + /** + * Gets value of current node. + * @return Returns node value. + */ + _string get_value() { + return this->value; + } + + /** + * Gets specific child node of the current one. + * @param value Specifies node which has argument's value set in value. + * @param recursive If true, longest passing node is searched recursively. Otherwise child node of current one with exactly same value is searched. + * @return Returns pointer to node, if the node has been found. NULL otherwise. + */ + class aho::trie * get_child(_string value, bool recursive = false) { + for(uint i = 0; i < this->children.size(); i++) { + //Je¿eli znalaz³, to zwróæ + if(this->children[i]->get_value() == value) { + return this->children[i]; + } + else if(recursive && (this->children[i]->get_value().compare(value.substr(0, this->children[i]->get_value().size())) == 0)) { + return this->children[i]->get_child(value, recursive); + } + } + + return NULL; + } + + /** + * Gets node's suffix + * @param value text value of suffix. + * @return Returns pointer to suffix node, if the node has been found. NULL otherwise. + */ + class aho::trie * get_suffix(_string value = TXT("")) { + return this->suffixes[value]; + } + + /** + * Gets parent node of the current trie node. + */ + class aho::trie * get_parent(){ + return this->parent; + } + + /** + * Checks if current node is one of patterns to search. + * @return Returns true, if node is one of patterns. False otherwise. + */ + bool is_pattern() { + return this->pattern; + } + + /** + * Sets node to be a pattern to search. + * @param pattern Specifies if node is a pattern. + */ + void is_pattern(bool pattern) { + this->pattern = pattern; + } + }; +}; + +#endif \ No newline at end of file diff --git a/src/typedef.h b/src/typedef.h new file mode 100644 index 0000000..2e48710 --- /dev/null +++ b/src/typedef.h @@ -0,0 +1,129 @@ +/*************************************************************************** + * Copyright (C) 2008 by Rafa³ D³ugo³êcki * + * dlugolecki.rafal@gmail.com * + * quayle@wp.pl * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ***************************************************************************/ + +/* Rafa³ D³ugo³êcki */ + +// +// File: typedef.h +// Author: rafal +// +// Created on 29 lipiec 2008, 22:37 +// + +#ifndef _TYPEDEF_H +#define _TYPEDEF_H + +#include +#include +#include +#include +#include +#include +#include //for isalnum and others... +#include //for wide version of above +#include + +#include "config.h" +/* + * Typedefs + */ +typedef unsigned int uint; + +#ifdef USE_WCHAR + typedef wchar_t tchar; +//these are not types but variables +# define tcout std::wcout +# define tcin std::wcin +# define tcerr std::wcerr +# define TXT(x) L##x +# define TCHAR_TYPE TXT("wchar_t") + +//these are functions + +# define tisalnum(c) std::iswalnum(c) +# define tisalpha(c) std::iswalpha(c) +# define tiscntrl(c) std::iswcntrl(c) +# define tisdigit(c) std::iswdigit(c) +# define tisgraph(c) std::iswgraph(c) +# define tislower(c) std::iswlower(c) +# define tisprint(c) std::iswprint(c) +# define tispunct(c) std::iswpunct(c) +# define tisspace(c) std::iswspace(c) +# define tisupper(c) std::iswupper(c) +# define tisxdigit(c) std::iswxdigit(c) + +#else + typedef char tchar; +//these are not types but variables +# define tcout std::cout +# define tcin std::cin +# define tcerr std::cerr +# define TXT(x) x +# define TCHAR_TYPE TXT("char") +//these are functions + +# define tisalnum(c) std::isalnum(c) +# define tisalpha(c) std::isalpha(c) +# define tiscntrl(c) std::iscntrl(c) +# define tisdigit(c) std::isdigit(c) +# define tisgraph(c) std::isgraph(c) +# define tislower(c) std::islower(c) +# define tisprint(c) std::isprint(c) +# define tispunct(c) std::ispunct(c) +# define tisspace(c) std::isspace(c) +# define tisupper(c) std::isupper(c) +# define tisxdigit(c) std::isxdigit(c) + +#endif + +#ifdef USE_ICU + typedef UChar tchar; +//these are not types but variables +# define tcout std::cout +# define tcin std::cin +# define tcerr std::cerr +# define TXT(x) x +# define TCHAR_TYPE TXT("wchar_t or uint16_t") +#endif + +typedef std::basic_string tstring; + +typedef std::basic_istream tistream; +typedef std::basic_ostream tostream; + +typedef std::basic_fstream tfstream; +typedef std::basic_ifstream tifstream; +typedef std::basic_ofstream tofstream; + +typedef std::basic_stringstream tstringstream; +typedef std::basic_istringstream tistringstream; +typedef std::basic_ostringstream tostringstream; + +typedef std::basic_stringbuf tstringbuf; +typedef std::basic_filebuf tfilebuf; + +#ifdef USE_BOOST +# include +#endif + + + +#endif -- 2.30.2