00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #ifndef TESS_LANG_MODEL_H
00021 #define TESS_LANG_MODEL_H
00022
00023 #include <string>
00024
00025 #include "char_altlist.h"
00026 #include "cube_reco_context.h"
00027 #include "cube_tuning_params.h"
00028 #include "dict.h"
00029 #include "lang_model.h"
00030 #include "tessdatamanager.h"
00031 #include "tess_lang_mod_edge.h"
00032
00033 namespace tesseract {
00034 // Array bounds used by TessLangModel's static tables and its literal_str_ member.
00035 const int kStateCnt = 4;  // first dimension of num_state_machine_; length of num_max_repeat_
00036 const int kNumLiteralCnt = 5;  // second dimension of num_state_machine_; length of literal_str_
00037 // LangModel subclass holding a per-instance dawg vector (word_dawgs_) plus class-wide static dawg handles and tables.
00038 class TessLangModel : public LangModel {
00039 public:  // The constructor below is only declared here; how its arguments are used is defined out of line.
00040 TessLangModel(const string &lm_params,
00041 const string &data_file_path,
00042 bool load_system_dawg,
00043 TessdataManager *tessdata_manager,
00044 CubeRecoContext *cntxt);
00045 ~TessLangModel() {  // Releases word_dawgs_; nothing else is freed here.
00046 if (word_dawgs_ != NULL) {
00047 word_dawgs_->delete_data_pointers();  // NOTE(review): presumably deletes the objects the vector points to - confirm in DawgVector
00048 delete word_dawgs_;  // then free the vector object itself
00049 }
00050 }
00051
00052 // Always returns NULL. NOTE(review): callers presumably treat a NULL edge as "start at the root" - confirm.
00053 inline TessLangModEdge *Root() {
00054 return NULL;
00055 }
00056 // Returns an array of LangModEdge pointers for the given alt_list and edge.
00057 // NOTE(review): presumably writes the number of returned edges to *edge_cnt;
00058 // ownership of the returned array and the meaning of a NULL edge are defined
00059 // out of line - confirm against the implementation.
00060 LangModEdge **GetEdges(CharAltList *alt_list,
00061 LangModEdge *edge,
00062 int *edge_cnt);
00063 // Sequence and character queries on char_32 input; all are defined out of line.
00064 // In this public IsValidSequence overload, final_edge is optional and defaults
00065 // to NULL.
00066
00067 bool IsValidSequence(const char_32 *sequence, bool eow_flag,
00068 LangModEdge **final_edge = NULL);
00069 bool IsLeadingPunc(char_32 ch);
00070 bool IsTrailingPunc(char_32 ch);
00071 bool IsDigit(char_32 ch);
00072 // Takes the string by pointer. NOTE(review): presumably edits *lm_str in place - confirm.
00073 void RemoveInvalidCharacters(string *lm_str);
00074 private:
00075 // Static (class-wide) dawg handles and tables; the tables are dimensioned by kStateCnt and kNumLiteralCnt.
00076 static const Dawg *ood_dawg_;
00077 static const Dawg *number_dawg_;
00078 static const int num_state_machine_[kStateCnt][kNumLiteralCnt];
00079 static const int num_max_repeat_[kStateCnt];
00080
00081
00082
00083 // Per-instance dawg vector; the destructor deletes it (after delete_data_pointers()) when it is non-NULL.
00084 DawgVector *word_dawgs_;
00085 // Static integers shared by all instances. NOTE(review): their meaning and values are set out of line.
00086 static int max_edge_;
00087 static int max_ood_shape_cost_;
00088
00089
00090 // Per-instance strings. NOTE(review): names suggest character classes; how they are populated is not visible here.
00091 string lead_punc_;
00092 string trail_punc_;
00093 string num_lead_punc_;
00094 string num_trail_punc_;
00095 string operators_;
00096 string digits_;
00097 string alphas_;
00098
00099
00100 // Array of kNumLiteralCnt string pointers. NOTE(review): presumably aliases some of the strings above - confirm.
00101 string *literal_str_[kNumLiteralCnt];
00102
00103 // Context pointer; not released by the destructor shown above. NOTE(review): presumably the cntxt constructor argument.
00104 CubeRecoContext *cntxt_;
00105 bool has_case_;
00106 // Private helpers from here on are declared only; their behavior is defined out of line.
00107 // NOTE(review): the int-returning helpers presumably return how many edges they stored in edge_array - confirm.
00108 int FanOut(CharAltList *alt_list,
00109 const Dawg *dawg, EDGE_REF edge_ref, EDGE_REF edge_ref_mask,
00110 const char_32 *str, bool root_flag, LangModEdge **edge_array);
00111
00112
00113 int Edges(const char *strng, const Dawg *dawg,
00114 EDGE_REF edge_ref, EDGE_REF edge_ref_mask,
00115 LangModEdge **edge_array);
00116
00117 int NumberEdges(EDGE_REF edge_ref, LangModEdge **edge_array);
00118
00119 int OODEdges(CharAltList *alt_list, EDGE_REF edge_ref,
00120 EDGE_REF edge_ref_mask, LangModEdge **edge_array);
00121
00122 void FreeEdges(int edge_cnt, LangModEdge **edge_array);
00123
00124
00125
00126 // Private IsValidSequence overload: additionally takes an edge, and final_edge has no default value.
00127 bool IsValidSequence(LangModEdge *edge, const char_32 *sequence,
00128 bool eow_flag, LangModEdge **final_edge);
00129
00130 // Takes the same lm_params type as the constructor. NOTE(review): the bool result presumably signals success - confirm.
00131 bool LoadLangModelElements(const string &lm_params);
00132
00133 // Const member returning an int. NOTE(review): presumably the number of dawgs available to this model - confirm.
00134 int NumDawgs() const;
00135
00136
00137 // Const lookup by integer index. NOTE(review): valid index range is presumably [0, NumDawgs()) - confirm.
00138 const Dawg *GetDawg(int index) const;
00139 };
00140 }
00141
00142 #endif // TESS_LANG_MODEL_H