00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00018
00019 #ifndef TESSERACT_DICT_DICT_H_
00020 #define TESSERACT_DICT_DICT_H_
00021
00022 #include "ambigs.h"
00023 #include "dawg.h"
00024 #include "host.h"
00025 #include "image.h"
00026 #include "oldlist.h"
00027 #include "ratngs.h"
00028 #include "stopper.h"
00029 #include "trie.h"
00030 #include "unicharset.h"
00031 #include "permute.h"
00032
// Size of DawgArgs::rating_array and the value used to fill it.
// Both replacement lists are fully parenthesized so each macro expands to a
// single operand wherever it is pasted (for example after sizeof or next to
// another unary/binary operator).
#define MAX_WERD_LENGTH ((inT64) 128)
#define NO_RATING (-1)
00035
00037 struct CHAR_FRAGMENT_INFO {
00038 UNICHAR_ID unichar_id;
00039 const CHAR_FRAGMENT *fragment;
00040 int num_fragments;
00041 float rating;
00042 float certainty;
00043 };
00044
00045 namespace tesseract {
00046
00047 typedef GenericVector<Dawg *> DawgVector;
00048
00049
00050
00051
00052 static const int kAnyWordLength = -1;
00053 static const int kRatingPad = 4;
00054
00055
00056 static const char kHyphenSymbol[] = "-";
00057 static const int kMaxNumDawgEdgees = 2000000;
00058 static const int kMaxDocDawgEdges = 250000;
00059 static const int kMaxUserDawgEdges = 50000;
00060 static const float kSimCertaintyScale = -10.0;
00061 static const float kSimCertaintyOffset = -10.0;
00062 static const float kSimilarityFloor = 100.0;
00063 static const int kDocDictMaxRepChars = 4;
00064
00065 struct DawgArgs {
00066 DawgArgs(DawgInfoVector *d, DawgInfoVector *c, DawgInfoVector *ud,
00067 DawgInfoVector *uc, float r, PermuterType p, int len, int e) :
00068 active_dawgs(d), constraints(c), updated_active_dawgs(ud),
00069 updated_constraints(uc), rating_margin(r) {
00070 for (int i = 0; i < MAX_WERD_LENGTH; ++i) {
00071 rating_array[i] = NO_RATING;
00072 }
00073 permuter = p;
00074 sought_word_length = len;
00075 end_char_choice_index = e;
00076 }
00077 DawgInfoVector *active_dawgs;
00078 DawgInfoVector *constraints;
00079 DawgInfoVector *updated_active_dawgs;
00080 DawgInfoVector *updated_constraints;
00081 PermuterType permuter;
00082 int sought_word_length;
00083
00084
00085 float rating_margin;
00086 float rating_array[MAX_WERD_LENGTH];
00087 int end_char_choice_index;
00088 };
00089
00090 class Dict {
00091 public:
00092 Dict(Image* image_ptr);
00093 ~Dict();
00094 const Image* getImage() const {
00095 return image_ptr_;
00096 }
00097 Image* getImage() {
00098 return image_ptr_;
00099 }
00100 const UNICHARSET& getUnicharset() const {
00101 return getImage()->getCCUtil()->unicharset;
00102 }
00103 UNICHARSET& getUnicharset() {
00104 return getImage()->getCCUtil()->unicharset;
00105 }
00106 const UnicharAmbigs &getUnicharAmbigs() {
00107 return getImage()->getCCUtil()->unichar_ambigs;
00108 }
00109
00110 inline bool compound_marker(UNICHAR_ID unichar_id) {
00111 return (unichar_id == getUnicharset().unichar_to_id("-") ||
00112 unichar_id == getUnicharset().unichar_to_id("/"));
00113 }
00114
00115
00116
00118 inline bool hyphenated() const { return
00119 !last_word_on_line_ && hyphen_word_ && GetMaxFixedLengthDawgIndex() < 0;
00120 }
00122 inline int hyphen_base_size() const {
00123 return this->hyphenated() ? hyphen_word_->length() : 0;
00124 }
00128 inline void copy_hyphen_info(WERD_CHOICE *word) const {
00129 if (this->hyphenated()) {
00130 *word = *hyphen_word_;
00131 if (hyphen_debug_level) word->print("copy_hyphen_info: ");
00132 }
00133 }
00137 inline void remove_hyphen_head(WERD_CHOICE *word) const {
00138 if (this->hyphenated()) {
00139 word->remove_unichar_ids(0, hyphen_word_->length());
00140 if (hyphen_debug_level) hyphen_word_->print("remove_hyphen_head: ");
00141 }
00142 }
00144 inline bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const {
00145 return (last_word_on_line_ && !first_pos &&
00146 unichar_id == hyphen_unichar_id_);
00147 }
00149 inline bool has_hyphen_end(const WERD_CHOICE &word) const {
00150 int word_index = word.length() - 1;
00151 return has_hyphen_end(word.unichar_id(word_index), word_index == 0);
00152 }
00156 void reset_hyphen_vars(bool last_word_on_line);
00159 void set_hyphen_word(const WERD_CHOICE &word,
00160 const DawgInfoVector &active_dawgs,
00161 const DawgInfoVector &constraints);
00162
00163
00166 inline void update_best_choice(const WERD_CHOICE &word,
00167 WERD_CHOICE *best_choice) {
00168 if (word.rating() < best_choice->rating()) *best_choice = word;
00169 }
00173 void init_active_dawgs(int sought_word_length,
00174 DawgInfoVector *active_dawgs,
00175 bool ambigs_mode) const;
00178 void init_constraints(DawgInfoVector *constraints) const;
// True when rating_limit is zero or negative.
inline bool ambigs_mode(float rating_limit) {
  const bool non_positive_limit = (rating_limit <= 0.0);
  return non_positive_limit;
}
00188 WERD_CHOICE *dawg_permute_and_select(
00189 const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit,
00190 int sought_word_length, int end_char_choice_index);
00191 WERD_CHOICE *dawg_permute_and_select(
00192 const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit) {
00193 return dawg_permute_and_select(char_choices, rating_limit,
00194 kAnyWordLength, 0);
00195 }
00203 void go_deeper_dawg_fxn(
00204 const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
00205 int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
00206 bool word_ending, WERD_CHOICE *word, float certainties[],
00207 float *limit, WERD_CHOICE *best_choice, int *attempts_left,
00208 void *void_more_args);
00209
00210
00211 WERD_CHOICE *get_top_choice_word(
00212 const BLOB_CHOICE_LIST_VECTOR &char_choices);
00213 WERD_CHOICE *permute_top_choice(
00214 const BLOB_CHOICE_LIST_VECTOR &char_choices,
00215 float* rating_limit,
00216 WERD_CHOICE *raw_choice,
00217 BOOL8 *any_alpha);
00218 const char* choose_il1(const char *first_char,
00219 const char *second_char,
00220 const char *third_char,
00221 const char *prev_char,
00222 const char *next_char,
00223 const char *next_next_char);
00224 WERD_CHOICE *permute_all(const BLOB_CHOICE_LIST_VECTOR &char_choices,
00225 const WERD_CHOICE *best_choice,
00226 WERD_CHOICE *raw_choice);
00227 void end_permute();
00228 void permute_subword(const BLOB_CHOICE_LIST_VECTOR &char_choices,
00229 float rating_limit,
00230 int start,
00231 int end,
00232 WERD_CHOICE *current_word);
00233 bool permute_characters(const BLOB_CHOICE_LIST_VECTOR &char_choices,
00234 WERD_CHOICE *best_choice,
00235 WERD_CHOICE *raw_choice);
00236 WERD_CHOICE *permute_compound_words(
00237 const BLOB_CHOICE_LIST_VECTOR &char_choices,
00238 float rating_limit);
00242 WERD_CHOICE *permute_fixed_length_words(
00243 const BLOB_CHOICE_LIST_VECTOR &char_choices,
00244 PermuterState *permuter_state);
00246 void incorporate_segcost(WERD_CHOICE* word);
00250 WERD_CHOICE *permute_script_words(
00251 const BLOB_CHOICE_LIST_VECTOR &char_choices,
00252 PermuterState *permuter_state);
00254 WERD_CHOICE *permute_chartype_words(
00255 const BLOB_CHOICE_LIST_VECTOR &char_choices,
00256 PermuterState *permuter_state);
00257
00261 char top_word_chartype(const BLOB_CHOICE_LIST_VECTOR &char_choices,
00262 char* pos_chartypes);
00263
00264 WERD_CHOICE *top_fragments_permute_and_select(
00265 const BLOB_CHOICE_LIST_VECTOR &char_choices,
00266 float rating_limit);
00271 void go_deeper_top_fragments_fxn(
00272 const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
00273 int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
00274 bool word_ending, WERD_CHOICE *word, float certainties[], float *limit,
00275 WERD_CHOICE *best_choice, int *attempts_left, void *more_args);
00276
00278 bool fragment_state_okay(UNICHAR_ID curr_unichar_id,
00279 float curr_rating, float curr_certainty,
00280 const CHAR_FRAGMENT_INFO *prev_char_frag_info,
00281 const char *debug, int word_ending,
00282 CHAR_FRAGMENT_INFO *char_frag_info);
00283 void permute_choices(
00284 const char *debug,
00285 const BLOB_CHOICE_LIST_VECTOR &char_choices,
00286 int char_choice_index,
00287 const CHAR_FRAGMENT_INFO *prev_char_frag_info,
00288 WERD_CHOICE *word,
00289 float certainties[],
00290 float *limit,
00291 WERD_CHOICE *best_choice,
00292 int *attempts_left,
00293 void *more_args);
00294
00295 void append_choices(
00296 const char *debug,
00297 const BLOB_CHOICE_LIST_VECTOR &char_choices,
00298 const BLOB_CHOICE &blob_choice,
00299 int char_choice_index,
00300 const CHAR_FRAGMENT_INFO *prev_char_frag_info,
00301 WERD_CHOICE *word,
00302 float certainties[],
00303 float *limit,
00304 WERD_CHOICE *best_choice,
00305 int *attempts_left,
00306 void *more_args);
00308 void (Dict::*go_deeper_fxn_)(const char *debug,
00309 const BLOB_CHOICE_LIST_VECTOR &char_choices,
00310 int char_choice_index,
00311 const CHAR_FRAGMENT_INFO *prev_char_frag_info,
00312 bool word_ending, WERD_CHOICE *word,
00313 float certainties[], float *limit,
00314 WERD_CHOICE *best_choice, int *attempts_left,
00315 void *void_more_args);
00316
00317 bool NoDangerousAmbig(WERD_CHOICE *BestChoice,
00318 DANGERR *fixpt,
00319 bool fix_replaceable,
00320 BLOB_CHOICE_LIST_VECTOR *Choices,
00321 bool *modified_blobs);
00322 double StopperAmbigThreshold(double f1, double f2) {
00323 return (f2 - f1) * stopper_ambiguity_threshold_gain -
00324 stopper_ambiguity_threshold_offset;
00325 }
00326
00327
00328
00329 int FreeBadChoice(void *item1,
00330 void *item2);
00339 void ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
00340 UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice,
00341 BLOB_CHOICE_LIST_VECTOR *blob_choices,
00342 bool *modified_blobs);
00343
00344 inline void DisableChoiceAccum() { keep_word_choices_ = false; }
00345 inline void EnableChoiceAccum() { keep_word_choices_ = true; }
00346 inline bool ChoiceAccumEnabled() { return keep_word_choices_; }
00347
00349 int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice);
00356 VIABLE_CHOICE NewViableChoice(const WERD_CHOICE &WordChoice,
00357 FLOAT32 AdjustFactor,
00358 const float Certainties[]);
00360 void PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice);
00363 bool StringSameAs(const WERD_CHOICE &WordChoice,
00364 VIABLE_CHOICE ViableChoice);
00366 bool StringSameAs(const char *String,
00367 const char *String_lengths,
00368 VIABLE_CHOICE ViableChoice);
00376 int UniformCertainties(const BLOB_CHOICE_LIST_VECTOR &Choices,
00377 const WERD_CHOICE &BestChoice);
00379 bool AcceptableChoice(BLOB_CHOICE_LIST_VECTOR *Choices,
00380 WERD_CHOICE *BestChoice,
00381 DANGERR *fixpt,
00382 ACCEPTABLE_CHOICE_CALLER caller,
00383 bool *modified_blobs);
00387 bool AcceptableResult(const WERD_CHOICE &BestChoice);
00390 int ChoiceSameAs(const WERD_CHOICE &WordChoice, VIABLE_CHOICE ViableChoice);
00398 void LogNewChoice(FLOAT32 AdjustFactor, const float Certainties[],
00399 bool raw_choice, WERD_CHOICE *WordChoice);
00400 void EndDangerousAmbigs();
00402 bool CurrentBestChoiceIs(const WERD_CHOICE &WordChoice);
00404 FLOAT32 CurrentBestChoiceAdjustFactor();
00406 bool CurrentWordAmbig();
00408 void DebugWordChoices();
00410 void PrintAmbigAlternatives(FILE *file, const char *label,
00411 int label_num_unichars);
00414 void FillViableChoice(const WERD_CHOICE &WordChoice,
00415 FLOAT32 AdjustFactor, const float Certainties[],
00416 VIABLE_CHOICE ViableChoice);
00419 bool AlternativeChoicesWorseThan(FLOAT32 Threshold);
00422 void FilterWordChoices();
00437 void FindClassifierErrors(FLOAT32 MinRating,
00438 FLOAT32 MaxRating,
00439 FLOAT32 RatingMargin,
00440 FLOAT32 Thresholds[]);
00443 void InitChoiceAccum();
00445 void ClearBestChoiceAccum();
00449 void LogNewSegmentation(PIECES_STATE BlobWidth);
00452 void LogNewSplit(int Blob);
00455 void AddNewChunk(VIABLE_CHOICE Choice, int Blob);
00457 void SettupStopperPass1();
00459 void SettupStopperPass2();
00460
00462 int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset);
00465 bool absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset);
00466
00467
00468
00471 void Load();
00472 void End();
00473
00474
00475 void ResetDocumentDictionary() {
00476 if (pending_words_ != NULL)
00477 pending_words_->clear();
00478 if (document_words_ != NULL)
00479 document_words_->clear();
00480 }
00481
00482
00483
00484
00485 void LoadEquivalenceList(const char *unichar_strings[]);
00486
00487
00488
00489 UNICHAR_ID NormalizeUnicharIdForMatch(UNICHAR_ID unichar_id) const;
00490
00553
00554 int def_letter_is_okay(void* void_dawg_args,
00555 UNICHAR_ID unichar_id, bool word_end) const;
00556
00557 int (Dict::*letter_is_okay_)(void* void_dawg_args,
00558 UNICHAR_ID unichar_id, bool word_end) const;
00560 int LetterIsOkay(void* void_dawg_args,
00561 UNICHAR_ID unichar_id, bool word_end) const {
00562 return (this->*letter_is_okay_)(void_dawg_args, unichar_id, word_end);
00563 }
00564
00565
00567 double (Dict::*probability_in_context_)(const char* lang,
00568 const char* context,
00569 int context_bytes,
00570 const char* character,
00571 int character_bytes);
00573 double ProbabilityInContext(const char* context,
00574 int context_bytes,
00575 const char* character,
00576 int character_bytes) {
00577 return (this->*probability_in_context_)(
00578 getImage()->getCCUtil()->lang.string(),
00579 context, context_bytes,
00580 character, character_bytes);
00581 }
00582
// Default probability-in-context implementation: ignores every argument and
// always returns 0.0.
// All five parameters are explicitly voided so none triggers an
// unused-parameter warning (lang was previously the only one left out).
double def_probability_in_context(
    const char* lang, const char* context, int context_bytes,
    const char* character, int character_bytes) {
  (void) lang;
  (void) context;
  (void) context_bytes;
  (void) character;
  (void) character_bytes;
  return 0.0;
}
00593 double ngram_probability_in_context(const char* lang,
00594 const char* context,
00595 int context_bytes,
00596 const char* character,
00597 int character_bytes);
00598
00600 inline const int NumDawgs() const { return dawgs_.size(); }
00602 inline const Dawg *GetDawg(int index) const { return dawgs_[index]; }
00604 inline const Dawg *GetPuncDawg() const { return punc_dawg_; }
00606 inline const Dawg *GetUnambigDawg() const { return unambig_dawg_; }
00608 inline const Dawg *GetFixedLengthDawg(int word_length) const {
00609 if (word_length > max_fixed_length_dawgs_wdlen_) return NULL;
00610 assert(dawgs_.size() > word_length);
00611 return dawgs_[word_length];
00612 }
00613 inline const int GetMaxFixedLengthDawgIndex() const {
00614 return max_fixed_length_dawgs_wdlen_;
00615 }
00617 static inline NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref) {
00618 if (edge_ref == NO_EDGE) return 0;
00619 NODE_REF node = dawg->next_node(edge_ref);
00620 if (node == 0) node = NO_EDGE;
00621 return node;
00622 }
00628 inline bool ConstraintsOk(const DawgInfoVector &constraints,
00629 int word_end, DawgType current_dawg_type) const {
00630 if (!word_end) return true;
00631 if (current_dawg_type == DAWG_TYPE_PUNCTUATION) return true;
00632 for (int c = 0; c < constraints.length(); ++c) {
00633 const DawgInfo &cinfo = constraints[c];
00634 Dawg *cdawg = dawgs_[cinfo.dawg_index];
00635 if (!cdawg->end_of_word(cinfo.ref)) {
00636 if (dawg_debug_level >= 3) {
00637 tprintf("Constraint [%d, " REFFORMAT "] is not satisfied\n",
00638 cinfo.dawg_index, cinfo.ref);
00639 }
00640 return false;
00641 }
00642 }
00643 return true;
00644 }
00645
00651 void ProcessPatternEdges(const Dawg *dawg, const DawgInfo &info,
00652 UNICHAR_ID unichar_id, bool word_end,
00653 DawgArgs *dawg_args,
00654 PermuterType *current_permuter) const;
00655
00659
00665 static void ReadFixedLengthDawgs(DawgType type, const STRING &lang,
00666 PermuterType perm, int debug_level,
00667 FILE *file, DawgVector *dawg_vec,
00668 int *max_wdlen);
00671 static void WriteFixedLengthDawgs(
00672 const GenericVector<SquishedDawg *> &dawg_vec,
00673 int num_dawgs, int debug_level, FILE *output_file);
00674
00676 inline static bool valid_word_permuter(uinT8 perm, bool numbers_ok) {
00677 return (perm == SYSTEM_DAWG_PERM || perm == FREQ_DAWG_PERM ||
00678 perm == DOC_DAWG_PERM || perm == USER_DAWG_PERM ||
00679 perm == USER_PATTERN_PERM || (numbers_ok && perm == NUMBER_PERM));
00680 }
00681 int valid_word(const WERD_CHOICE &word, bool numbers_ok) const;
00682 int valid_word(const WERD_CHOICE &word) const {
00683 return valid_word(word, false);
00684 }
00685 int valid_word_or_number(const WERD_CHOICE &word) const {
00686 return valid_word(word, true);
00687 }
00689 int valid_word(const char *string) const {
00690 WERD_CHOICE word(string, getUnicharset());
00691 return valid_word(word);
00692 }
00693
00694 bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const;
00699 bool valid_punctuation(const WERD_CHOICE &word);
00701 int good_choice(const WERD_CHOICE &choice);
00703 void add_document_word(const WERD_CHOICE &best_choice);
00704 int get_top_word_script(const BLOB_CHOICE_LIST_VECTOR &char_choices,
00705 const UNICHARSET &unicharset);
00707 void adjust_word(WERD_CHOICE *word, float *certainty_array,
00708 const BLOB_CHOICE_LIST_VECTOR *char_choices,
00709 bool nonword, float additional_adjust, bool debug);
00710 void adjust_word(WERD_CHOICE *word, float *certainty_array, bool debug) {
00711 adjust_word(word, certainty_array, NULL, false, 0.0f, debug);
00712 }
00713 void adjust_non_word(WERD_CHOICE *word, float *certainty_array, bool debug) {
00714 adjust_word(word, certainty_array, NULL, true, 0.0f, debug);
00715 }
00717 inline void SetWordsegRatingAdjustFactor(float f) {
00718 wordseg_rating_adjust_factor_ = f;
00719 }
00720
00721 const LIST &getBestChoices() { return best_choices_; }
00722
00723 private:
00725 Image* image_ptr_;
00732 UnicharAmbigs *dang_ambigs_table_;
00734 UnicharAmbigs *replace_ambigs_table_;
00739 bool keep_word_choices_;
00741 FLOAT32 reject_offset_;
00743 PIECES_STATE current_segmentation_;
00745 VIABLE_CHOICE best_raw_choice_;
00746 LIST raw_choices_;
00747 LIST best_choices_;
00748
00749 UNICHAR_ID hyphen_unichar_id_;
00750 WERD_CHOICE *hyphen_word_;
00751 DawgInfoVector hyphen_active_dawgs_;
00752 DawgInfoVector hyphen_constraints_;
00753 bool last_word_on_line_;
00754
00755
00756
00757
00758 GenericVector<GenericVectorEqEq<UNICHAR_ID> > equivalent_symbols_;
00759
00760 DawgVector dawgs_;
00761 SuccessorListsVector successors_;
00762 Trie *pending_words_;
00763
00764
00765
00766
00767
00768 Dawg *bigram_dawg_;
00771
00772
00773 Dawg *freq_dawg_;
00774 Dawg *unambig_dawg_;
00775 Dawg *punc_dawg_;
00776 Trie *document_words_;
00779 int max_fixed_length_dawgs_wdlen_;
00782 float wordseg_rating_adjust_factor_;
00783
00784 FILE *output_ambig_words_file_;
00785
00786 public:
00790 STRING_VAR_H(user_words_suffix, "", "A list of user-provided words.");
00791 STRING_VAR_H(user_patterns_suffix, "",
00792 "A list of user-provided patterns.");
00793 BOOL_VAR_H(load_system_dawg, true, "Load system word dawg.");
00794 BOOL_VAR_H(load_freq_dawg, true, "Load frequent word dawg.");
00795 BOOL_VAR_H(load_unambig_dawg, true, "Load unambiguous word dawg.");
00796 BOOL_VAR_H(load_punc_dawg, true,
00797 "Load dawg with punctuation patterns.");
00798 BOOL_VAR_H(load_number_dawg, true, "Load dawg with number patterns.");
00799 BOOL_VAR_H(load_fixed_length_dawgs, true, "Load fixed length"
00800 " dawgs (e.g. for non-space delimited languages)");
00801 BOOL_VAR_H(load_bigram_dawg, false,
00802 "Load dawg with special word bigrams.");
00803 double_VAR_H(segment_penalty_dict_frequent_word, 1.0,
00804 "Score multiplier for word matches which have good case and"
00805 "are frequent in the given language (lower is better).");
00806
00807 double_VAR_H(segment_penalty_dict_case_ok, 1.1,
00808 "Score multiplier for word matches that have good case "
00809 "(lower is better).");
00810
00811 double_VAR_H(segment_penalty_dict_case_bad, 1.3125,
00812 "Default score multiplier for word matches, which may have "
00813 "case issues (lower is better).");
00814
00815
00816 double_VAR_H(segment_penalty_ngram_best_choice, 1.24,
00817 "Multipler to for the best choice from the ngram model.");
00818
00819 double_VAR_H(segment_penalty_dict_nonword, 1.25,
00820 "Score multiplier for glyph fragment segmentations which "
00821 "do not match a dictionary word (lower is better).");
00822
00823 double_VAR_H(segment_penalty_garbage, 1.50,
00824 "Score multiplier for poorly cased strings that are not in"
00825 " the dictionary and generally look like garbage (lower is"
00826 " better).");
00827 STRING_VAR_H(output_ambig_words_file, "",
00828 "Output file for ambiguities found in the dictionary");
00829 INT_VAR_H(dawg_debug_level, 0, "Set to 1 for general debug info"
00830 ", to 2 for more details, to 3 to see all the debug messages");
00831 INT_VAR_H(hyphen_debug_level, 0, "Debug level for hyphenated words.");
00832 INT_VAR_H(max_viterbi_list_size, 10, "Maximum size of viterbi list.");
00833 BOOL_VAR_H(use_only_first_uft8_step, false,
00834 "Use only the first UTF8 step of the given string"
00835 " when computing log probabilities.");
00836 double_VAR_H(certainty_scale, 20.0, "Certainty scaling factor");
00837 double_VAR_H(stopper_nondict_certainty_base, -2.50,
00838 "Certainty threshold for non-dict words");
00839 double_VAR_H(stopper_phase2_certainty_rejection_offset, 1.0,
00840 "Reject certainty offset");
00841 INT_VAR_H(stopper_smallword_size, 2,
00842 "Size of dict word to be treated as non-dict word");
00843 double_VAR_H(stopper_certainty_per_char, -0.50,
00844 "Certainty to add for each dict char above small word size.");
00845 double_VAR_H(stopper_allowable_character_badness, 3.0,
00846 "Max certaintly variation allowed in a word (in sigma)");
00847 INT_VAR_H(stopper_debug_level, 0, "Stopper debug level");
00848 BOOL_VAR_H(stopper_no_acceptable_choices, false,
00849 "Make AcceptableChoice() always return false. Useful"
00850 " when there is a need to explore all segmentations");
00851 double_VAR_H(stopper_ambiguity_threshold_gain, 8.0,
00852 "Gain factor for ambiguity threshold.");
00853 double_VAR_H(stopper_ambiguity_threshold_offset, 1.5,
00854 "Certainty offset for ambiguity threshold.");
00855 BOOL_VAR_H(save_raw_choices, false, "Save all explored raw choices");
00856 INT_VAR_H(tessedit_truncate_wordchoice_log, 10, "Max words to keep in list");
00857 STRING_VAR_H(word_to_debug, "", "Word for which stopper debug information"
00858 " should be printed to stdout");
00859 STRING_VAR_H(word_to_debug_lengths, "",
00860 "Lengths of unichars in word_to_debug");
00861 INT_VAR_H(fragments_debug, 0, "Debug character fragments");
00862 INT_VAR_H(segment_debug, 0, "Debug the whole segmentation process");
00863 BOOL_VAR_H(permute_debug, 0, "Debug char permutation process");
00864 double_VAR_H(bestrate_pruning_factor, 2.0, "Multiplying factor of"
00865 " current best rate to prune other hypotheses");
00866 BOOL_VAR_H(permute_script_word, 0,
00867 "Turn on word script consistency permuter");
00868 BOOL_VAR_H(segment_segcost_rating, 0,
00869 "incorporate segmentation cost in word rating?");
00870 BOOL_VAR_H(segment_nonalphabetic_script, false,
00871 "Don't use any alphabetic-specific tricks."
00872 "Set to true in the traineddata config file for"
00873 " scripts that are cursive or inherently fixed-pitch");
00874 double_VAR_H(segment_reward_script, 0.95,
00875 "Score multipler for script consistency within a word. "
00876 "Being a 'reward' factor, it should be <= 1. "
00877 "Smaller value implies bigger reward.");
00878 BOOL_VAR_H(permute_fixed_length_dawg, 0,
00879 "Turn on fixed-length phrasebook search permuter");
00880 BOOL_VAR_H(permute_chartype_word, 0,
00881 "Turn on character type (property) consistency permuter");
00882 double_VAR_H(segment_reward_chartype, 0.97,
00883 "Score multipler for char type consistency within a word. ");
00884
00885 double_VAR_H(segment_reward_ngram_best_choice, 0.99,
00886 "Score multipler for ngram permuter's best choice"
00887 " (only used in the Han script path).");
00888 BOOL_VAR_H(save_doc_words, 0, "Save Document Words");
00889 BOOL_VAR_H(doc_dict_enable, 1, "Enable Document Dictionary ");
00890 double_VAR_H(doc_dict_pending_threshold, 0.0,
00891 "Worst certainty for using pending dictionary");
00892 double_VAR_H(doc_dict_certainty_threshold, -2.25, "Worst certainty"
00893 " for words that can be inserted into the document dictionary");
00894 BOOL_VAR_H(ngram_permuter_activated, false,
00895 "Activate character-level n-gram-based permuter");
00896 INT_VAR_H(max_permuter_attempts, 10000, "Maximum number of different"
00897 " character choices to consider during permutation."
00898 " This limit is especially useful when user patterns"
00899 " are specified, since overly generic patterns can result in"
00900 " dawg search exploring an overly large number of options.");
00901 BOOL_VAR_H(permute_only_top, false, "Run only the top choice permuter");
00902 };
00903 }
00904
00905 #endif // TESSERACT_DICT_DICT_H_