00001
00002
00003
00004
00005
00006
00007
00008 #ifndef BAYES_HPP__
00009 #define BAYES_HPP__
00010
00011 #include <iostream>
00012 #include <string>
00013 #ifdef __GNUC__
00014 # include <ext/hash_map>
00015 #else
00016 # include <hash_map>
00017 #endif
00018
00019 #include "Score.h"
00020 #include <boost/filesystem/path.hpp>
00021 #include <boost/cstdint.hpp>
00022
00023 typedef boost::uintmax_t size_type;
00024
00025 #ifdef __GNUC__
00026
00027 struct StringHash
00028 {
00029 size_type operator()(const std::string& s) const
00030 { return __gnu_cxx::hash<const char*>()(s.c_str()); }
00031 };
00032
00033 typedef __gnu_cxx::hash_map<std::string, size_type, StringHash> HashMap;
00034 #elif defined _MSC_VER
00035 typedef stdext::hash_map<std::string, size_type> HashMap;
00036 #else
00037 typedef std::hash_map<std::string, size_type> HashMap;
00038 #endif
00039
00040 namespace Bayes {
00041
00042
00043
00044
00049 class HashTable
00050 {
00051 public:
00055 HashTable();
00056
00062 size_type getTotalWordCount() const { return m_nTotalCount; }
00063
00071 size_type getWordCount(const std::string& word) const;
00072
00084 template<typename Iter>
00085 void learn(Iter begin, Iter end);
00086
00098 template<typename Iter>
00099 void unlearn(Iter begin, Iter end);
00100
00111 static bool hasWhitespaces(const std::string& word);
00112
00121 void read (std::istream& in);
00122
00130 void write (std::ostream& out) const;
00131
00132 protected:
00133
00145 void learnWord(const std::string& word);
00146
00159 void unlearnWord(const std::string& word);
00160
00161 protected:
00162 HashMap m_tHashMap;
00163 size_type m_nTotalCount;
00165 static const char* const m_szWhitespaces;
00169 private:
00170 friend std::ostream& operator<< (std::ostream& out, const HashTable& ht);
00171 };
00172
00173
00174
00175 std::ostream& operator<< (std::ostream& out, const HashTable& ht);
00176
00177
00178
00179
00180
00181
00182
00199 class BayesClassifier
00200 {
00201 public:
00205 typedef enum {
00206 GOOD,
00207 BAD
00208 } ClassificationTable;
00209
00210 private:
00221 const double wordScore(const std::string& word) const;
00222
00237 template<typename Iter>
00238 void unlearn(ClassificationTable table, Iter begin, Iter end);
00239
00255 template<typename Iter>
00256 void learn(ClassificationTable table, Iter begin, Iter end);
00257
00258 public:
00272 const Score score(const char* const text) const {
00273 return score(static_cast<const string>(text));
00274 }
00275
00289 const Score score(const string& text) const;
00290
00301 void learn(ClassificationTable table, const char* const text) {
00302 learn(table, static_cast<const string>(text));
00303 }
00304
00315 void learn(ClassificationTable table, const string& text);
00316
00330 void reclassify(ClassificationTable table, const char* const text) {
00331 reclassify(table, static_cast<const string>(text));
00332 }
00333
00347 void reclassify(ClassificationTable table, const string& text);
00348
00356 void load(const char* const file) {
00357 load(boost::filesystem::path(file, boost::filesystem::native));
00358 }
00359
00367 void load(const std::string& file) {
00368 load(boost::filesystem::path(file, boost::filesystem::native));
00369 }
00370
00378 void load(const boost::filesystem::path& file);
00379
00387 void save(const char* const file) const {
00388 save(boost::filesystem::path(file, boost::filesystem::native));
00389 }
00390
00398 void save(const std::string& file) const {
00399 save(boost::filesystem::path(file, boost::filesystem::native));
00400 }
00401
00409 void save(const boost::filesystem::path& file) const;
00410
00411 private:
00412
00413 HashTable m_atHashTables[2];
00417 static const int mScoredItems;
00421 static const char* const m_szTokenSeparators;
00424 static const char* const m_szTokenSeparatorsKept;
00427 friend std::ostream & operator<< (std::ostream& out, const BayesClassifier& base);
00428 };
00429
00430
00431 std::ostream& operator<< (std::ostream& out, const BayesClassifier& base);
00432
00433 }
00434
00435 #include "Bayes_impl.h"
00436
00437 #endif