bayes/Bayes.h

00001 /*----------------------------------------------------------------
00002  * project ....: bayes-irc
00003  * authors ....: shino
00004  * creation ...: 2004-10-26
00005  * revision ...: $Id: Bayes.h 25 2007-01-17 19:40:20Z shin0 $
00006  */
00007 
00008 #ifndef BAYES_HPP__
00009 #define BAYES_HPP__
00010 
00011 #include <iostream>
00012 #include <string>
00013 #ifdef __GNUC__
00014 #  include <ext/hash_map>
00015 #else
00016 #  include <hash_map>
00017 #endif
00018 
00019 #include "Score.h"
00020 #include <boost/filesystem/path.hpp>
00021 #include <boost/cstdint.hpp>
00022 
00023 typedef boost::uintmax_t size_type;
00024 
00025 #ifdef __GNUC__
00026 // for some reason there is no hash-function for std::string in gcc
00027 struct StringHash
00028 {
00029     size_type operator()(const std::string& s) const
00030         { return __gnu_cxx::hash<const char*>()(s.c_str()); }
00031 };
00032 
00033 typedef __gnu_cxx::hash_map<std::string, size_type, StringHash> HashMap;
00034 #elif defined _MSC_VER
00035 typedef stdext::hash_map<std::string, size_type> HashMap;
00036 #else
00037 typedef std::hash_map<std::string, size_type> HashMap;
00038 #endif
00039 
00040 namespace Bayes {
00041     
00042     //==============================================================================
00043     // class HashTable
00044 
00049     class HashTable
00050         {
00051        public:
00055             HashTable();
00056 
00062             size_type getTotalWordCount() const { return m_nTotalCount; }
00063 
00071             size_type getWordCount(const std::string& word) const;
00072 
00084             template<typename Iter>
00085                 void learn(Iter begin, Iter end);
00086 
00098             template<typename Iter>
00099                 void unlearn(Iter begin, Iter end);
00100 
00111             static bool hasWhitespaces(const std::string& word);
00112 
00121             void read (std::istream& in);
00122 
00130             void write (std::ostream& out) const;
00131 
00132         protected:
00133 
00145             void learnWord(const std::string& word);
00146 
00159             void unlearnWord(const std::string& word);
00160 
00161         protected:
00162             HashMap m_tHashMap; 
00163             size_type m_nTotalCount; 
00165             static const char* const m_szWhitespaces; 
00169         private:
00170             friend std::ostream& operator<< (std::ostream& out, const HashTable& ht);
00171         };
00172 
00173 
00174     //------------------------------------------------------------------------------
00175     std::ostream& operator<< (std::ostream& out, const HashTable& ht);
00176 
00177 
00178 
00179 
00180     //==============================================================================
00181     // class BayesClassifier
00182 
00199     class BayesClassifier
00200         {
00201         public:
00205             typedef enum { 
00206                 GOOD, 
00207                 BAD 
00208             } ClassificationTable; 
00209 
00210         private:
00221             const double wordScore(const std::string& word) const;
00222 
00237             template<typename Iter>
00238                 void unlearn(ClassificationTable table, Iter begin, Iter end);
00239 
00255             template<typename Iter>
00256                 void learn(ClassificationTable table, Iter begin, Iter end);
00257 
00258         public:
00272             const Score score(const char* const text) const {
00273                 return score(static_cast<const string>(text));
00274             }
00275 
00289             const Score score(const string& text) const;
00290 
00301             void learn(ClassificationTable table, const char* const text) {
00302                 learn(table, static_cast<const string>(text));
00303             }
00304 
00315             void learn(ClassificationTable table, const string& text);
00316 
00330             void reclassify(ClassificationTable table, const char* const text) {
00331                 reclassify(table, static_cast<const string>(text));
00332             }
00333 
00347             void reclassify(ClassificationTable table, const string& text);
00348 
00356             void load(const char* const file) {
00357                 load(boost::filesystem::path(file, boost::filesystem::native));
00358             }
00359 
00367             void load(const std::string& file) {
00368                 load(boost::filesystem::path(file, boost::filesystem::native));
00369             }
00370 
00378             void load(const boost::filesystem::path& file);
00379 
00387             void save(const char* const file) const {
00388                 save(boost::filesystem::path(file, boost::filesystem::native));
00389             }
00390             
00398             void save(const std::string& file) const {
00399                 save(boost::filesystem::path(file, boost::filesystem::native));
00400             }
00401             
00409             void save(const boost::filesystem::path& file) const;
00410 
00411         private:
00412             // use an array so we can use learn(size_t table, ...)
00413             HashTable m_atHashTables[2];
00417             static const int mScoredItems;
00421             static const char* const m_szTokenSeparators;
00424             static const char* const m_szTokenSeparatorsKept;
00427             friend std::ostream & operator<< (std::ostream& out, const BayesClassifier& base);
00428         };
00429 
00430     //------------------------------------------------------------------------------
00431     std::ostream& operator<< (std::ostream& out, const BayesClassifier& base);
00432 
00433 } // namespace Bayes
00434 
00435 #include "Bayes_impl.h"
00436 
00437 #endif

Generated on Sat Feb 10 21:32:39 2007 for bayes-irc by  doxygen 1.5.1