tesseract::LanguageModel Class Reference

#include <language_model.h>

List of all members.

Public Member Functions

 LanguageModel (const UnicityTable< FontInfo > *fontinfo_table, Dict *dict)
 ~LanguageModel ()
void InitForWord (const WERD_CHOICE *prev_word, bool fixed_pitch, float best_choice_cert, float max_char_wh_ratio, float rating_cert_scale, HEAP *pain_points, CHUNKS_RECORD *chunks_record, BlamerBundle *blamer_bundle, bool debug_blamer)
void CleanUp ()
void DeleteState (BLOB_CHOICE_LIST *choices)
LanguageModelFlagsType UpdateState (LanguageModelFlagsType changed, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE_LIST *parent_list, HEAP *pain_points, BestPathByColumn *best_path_by_column[], CHUNKS_RECORD *chunks_record, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
void GenerateNgramModelPainPointsFromColumn (int col, int row, HEAP *pain_points, CHUNKS_RECORD *chunks_record)
void GenerateProblematicPathPainPointsFromColumn (int col, int row, float best_choice_cert, HEAP *pain_points, BestPathByColumn *best_path_by_column[], CHUNKS_RECORD *chunks_record)
void GeneratePainPointsFromColumn (int col, const GenericVector< int > &non_empty_rows, float best_choice_cert, HEAP *pain_points, BestPathByColumn *best_path_by_column[], CHUNKS_RECORD *chunks_record)
void GeneratePainPointsFromBestChoice (HEAP *pain_points, CHUNKS_RECORD *chunks_record, BestChoiceBundle *best_choice_bundle)
bool GeneratePainPoint (int col, int row, bool ok_to_extend, float priority_adjustment, float worst_piece_cert, bool fragmented, float best_choice_cert, float max_char_wh_ratio, BLOB_CHOICE *parent_b, ViterbiStateEntry *parent_vse, CHUNKS_RECORD *chunks_record, HEAP *pain_points)
bool AcceptableChoiceFound ()
void GetWorstPieceCertainty (int col, int row, MATRIX *ratings, float *cert, bool *fragmented)
float ComputeOutlineLength (BLOB_CHOICE *b)

Public Attributes

int language_model_debug_level = 0
bool language_model_ngram_on = false
int language_model_ngram_order = 8
int language_model_viterbi_list_max_num_prunable = 10
int language_model_viterbi_list_max_size = 500
double language_model_ngram_small_prob = 0.000001
double language_model_ngram_nonmatch_score = -40.0
bool language_model_ngram_use_only_first_uft8_step = false
double language_model_ngram_scale_factor = 0.03
bool language_model_ngram_space_delimited_language = true
int language_model_min_compound_length = 3
int language_model_fixed_length_choices_depth = 3
double language_model_penalty_non_freq_dict_word = 0.1
double language_model_penalty_non_dict_word = 0.15
double language_model_penalty_punc = 0.2
double language_model_penalty_case = 0.1
double language_model_penalty_script = 0.5
double language_model_penalty_chartype = 0.3
double language_model_penalty_font = 0.00
double language_model_penalty_spacing = 0.05
double language_model_penalty_increment = 0.01
bool language_model_use_sigmoidal_certainty = false

Static Public Attributes

static const float kInitialPainPointPriorityAdjustment = 5.0f
static const float kDefaultPainPointPriorityAdjustment = 2.0f
static const float kBestChoicePainPointPriorityAdjustment = 0.5f
static const float kCriticalPainPointPriorityAdjustment = 0.1f
static const float kMaxAvgNgramCost = 25.0f
static const int kMinFixedLengthDawgLength = 2
static const float kLooseMaxCharWhRatio = 2.5f
static const LanguageModelFlagsType kSmallestRatingFlag = 0x1
static const LanguageModelFlagsType kLowerCaseFlag = 0x2
static const LanguageModelFlagsType kUpperCaseFlag = 0x4
static const LanguageModelFlagsType kConsistentFlag = 0x8
static const LanguageModelFlagsType kDawgFlag = 0x10
static const LanguageModelFlagsType kNgramFlag = 0x20
static const LanguageModelFlagsType kJustClassifiedFlag = 0x80
static const LanguageModelFlagsType kAllChangedFlag = 0xff

Protected Member Functions

float CertaintyScore (float cert)
bool NonAlphaOrDigitMiddle (int col, int row, int dimension, UNICHAR_ID unichar_id)
bool IsFragment (BLOB_CHOICE *b)
bool IsHan (int script_id)
void GetPieceCertainty (BLOB_CHOICE_LIST *blist, float *cert, bool *fragmented)
float ComputeAdjustment (int num_problems, float penalty)
float ComputeConsistencyAdjustment (const LanguageModelDawgInfo *dawg_info, const LanguageModelConsistencyInfo &consistency_info)
float ComputeConsistencyAdjustedRatingsSum (float ratings_sum, const LanguageModelDawgInfo *dawg_info, const LanguageModelConsistencyInfo &consistency_info)
float ComputeAdjustedPathCost (float ratings_sum, int length, float dawg_score, const LanguageModelDawgInfo *dawg_info, const LanguageModelNgramInfo *ngram_info, const LanguageModelConsistencyInfo &consistency_info, const AssociateStats &associate_stats, ViterbiStateEntry *parent_vse)
bool ProblematicPath (const ViterbiStateEntry &vse, UNICHAR_ID unichar_id, bool word_end)
void GetTopChoiceLowerUpper (LanguageModelFlagsType changed, BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper)
LanguageModelFlagsType AddViterbiStateEntry (LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, BLOB_CHOICE *parent_b, ViterbiStateEntry *parent_vse, HEAP *pain_points, BestPathByColumn *best_path_by_column[], CHUNKS_RECORD *chunks_record, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
void PrintViterbiStateEntry (const char *msg, ViterbiStateEntry *vse, BLOB_CHOICE *b, CHUNKS_RECORD *chunks_record)
void GenerateTopChoiceInfo (float ratings_sum, const LanguageModelDawgInfo *dawg_info, const LanguageModelConsistencyInfo &consistency_info, const ViterbiStateEntry *parent_vse, BLOB_CHOICE *b, LanguageModelFlagsType *top_choice_flags, LanguageModelFlagsType *changed)
LanguageModelDawgInfo * GenerateDawgInfo (bool word_end, int script_id, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse, LanguageModelFlagsType *changed)
LanguageModelNgramInfo * GenerateNgramInfo (const char *unichar, float certainty, float denom, int curr_col, int curr_row, const ViterbiStateEntry *parent_vse, BLOB_CHOICE *parent_b, LanguageModelFlagsType *changed)
float ComputeNgramCost (const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob)
float ComputeDenom (BLOB_CHOICE_LIST *curr_list)
void FillConsistencyInfo (int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, BLOB_CHOICE *parent_b, CHUNKS_RECORD *chunks_record, LanguageModelConsistencyInfo *consistency_info)
void UpdateBestChoice (BLOB_CHOICE *b, ViterbiStateEntry *vse, HEAP *pain_points, CHUNKS_RECORD *chunks_record, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
void ExtractRawFeaturesFromPath (const ViterbiStateEntry &vse, float *features)
WERD_CHOICE * ConstructWord (BLOB_CHOICE *b, ViterbiStateEntry *vse, CHUNKS_RECORD *chunks_record, BLOB_CHOICE_LIST_VECTOR *best_char_choices, float certainties[], float *dawg_score, STATE *state, BlamerBundle *blamer_bundle, bool *truth_path)
void UpdateCoveredByFixedLengthDawgs (const DawgInfoVector &active_dawgs, int word_index, int word_length, int *skip, int *covered, float *dawg_score, bool *dawg_score_done)
void ComputeAssociateStats (int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, CHUNKS_RECORD *chunks_record, AssociateStats *associate_stats)
bool PrunablePath (LanguageModelFlagsType top_choice_flags, const LanguageModelDawgInfo *dawg_info)
bool AcceptablePath (const ViterbiStateEntry &vse)

Protected Attributes

DawgArgs * dawg_args_
GenericVector< bool * > updated_flags_
float rating_cert_scale_
const UnicityTable< FontInfo > * fontinfo_table_
Dict * dict_
bool fixed_pitch_
float max_char_wh_ratio_
STRING prev_word_str_
int prev_word_unichar_step_len_
DawgInfoVector * beginning_active_dawgs_
DawgInfoVector * beginning_constraints_
DawgInfoVector * fixed_length_beginning_active_dawgs_
DawgInfoVector * empty_dawg_info_vec_
float max_penalty_adjust_
bool acceptable_choice_found_
bool correct_segmentation_explored_

Constructor & Destructor Documentation

tesseract::LanguageModel::LanguageModel ( const UnicityTable< FontInfo > *  fontinfo_table,
Dict *  dict 
)
tesseract::LanguageModel::~LanguageModel (  ) 

Member Function Documentation

bool tesseract::LanguageModel::AcceptableChoiceFound (  )  [inline]
bool tesseract::LanguageModel::AcceptablePath ( const ViterbiStateEntry &  vse  )  [inline, protected]
LanguageModelFlagsType tesseract::LanguageModel::AddViterbiStateEntry ( LanguageModelFlagsType  top_choice_flags,
float  denom,
bool  word_end,
int  curr_col,
int  curr_row,
BLOB_CHOICE *  b,
BLOB_CHOICE *  parent_b,
ViterbiStateEntry *  parent_vse,
HEAP *  pain_points,
BestPathByColumn *  best_path_by_column[],
CHUNKS_RECORD *  chunks_record,
BestChoiceBundle *  best_choice_bundle,
BlamerBundle *  blamer_bundle 
) [protected]
float tesseract::LanguageModel::CertaintyScore ( float  cert  )  [inline, protected]
void tesseract::LanguageModel::CleanUp (  ) 
float tesseract::LanguageModel::ComputeAdjustedPathCost ( float  ratings_sum,
int  length,
float  dawg_score,
const LanguageModelDawgInfo *  dawg_info,
const LanguageModelNgramInfo *  ngram_info,
const LanguageModelConsistencyInfo &  consistency_info,
const AssociateStats &  associate_stats,
ViterbiStateEntry *  parent_vse 
) [protected]
float tesseract::LanguageModel::ComputeAdjustment ( int  num_problems,
float  penalty 
) [inline, protected]
void tesseract::LanguageModel::ComputeAssociateStats ( int  col,
int  row,
float  max_char_wh_ratio,
ViterbiStateEntry *  parent_vse,
CHUNKS_RECORD *  chunks_record,
AssociateStats *  associate_stats 
) [inline, protected]
float tesseract::LanguageModel::ComputeConsistencyAdjustedRatingsSum ( float  ratings_sum,
const LanguageModelDawgInfo *  dawg_info,
const LanguageModelConsistencyInfo &  consistency_info 
) [inline, protected]
float tesseract::LanguageModel::ComputeConsistencyAdjustment ( const LanguageModelDawgInfo *  dawg_info,
const LanguageModelConsistencyInfo &  consistency_info 
) [inline, protected]
float tesseract::LanguageModel::ComputeDenom ( BLOB_CHOICE_LIST *  curr_list  )  [protected]
float tesseract::LanguageModel::ComputeNgramCost ( const char *  unichar,
float  certainty,
float  denom,
const char *  context,
int *  unichar_step_len,
bool *  found_small_prob,
float *  ngram_prob 
) [protected]
float tesseract::LanguageModel::ComputeOutlineLength ( BLOB_CHOICE *  b  )  [inline]
WERD_CHOICE * tesseract::LanguageModel::ConstructWord ( BLOB_CHOICE *  b,
ViterbiStateEntry *  vse,
CHUNKS_RECORD *  chunks_record,
BLOB_CHOICE_LIST_VECTOR *  best_char_choices,
float  certainties[],
float *  dawg_score,
STATE *  state,
BlamerBundle *  blamer_bundle,
bool *  truth_path 
) [protected]
void tesseract::LanguageModel::DeleteState ( BLOB_CHOICE_LIST *  choices  ) 
void tesseract::LanguageModel::ExtractRawFeaturesFromPath ( const ViterbiStateEntry &  vse,
float *  features 
) [protected]
void tesseract::LanguageModel::FillConsistencyInfo ( int  curr_col,
bool  word_end,
BLOB_CHOICE *  b,
ViterbiStateEntry *  parent_vse,
BLOB_CHOICE *  parent_b,
CHUNKS_RECORD *  chunks_record,
LanguageModelConsistencyInfo *  consistency_info 
) [protected]
LanguageModelDawgInfo * tesseract::LanguageModel::GenerateDawgInfo ( bool  word_end,
int  script_id,
int  curr_col,
int  curr_row,
const BLOB_CHOICE &  b,
const ViterbiStateEntry *  parent_vse,
LanguageModelFlagsType *  changed 
) [protected]
LanguageModelNgramInfo * tesseract::LanguageModel::GenerateNgramInfo ( const char *  unichar,
float  certainty,
float  denom,
int  curr_col,
int  curr_row,
const ViterbiStateEntry *  parent_vse,
BLOB_CHOICE *  parent_b,
LanguageModelFlagsType *  changed 
) [protected]
void tesseract::LanguageModel::GenerateNgramModelPainPointsFromColumn ( int  col,
int  row,
HEAP *  pain_points,
CHUNKS_RECORD *  chunks_record 
)
bool tesseract::LanguageModel::GeneratePainPoint ( int  col,
int  row,
bool  ok_to_extend,
float  priority_adjustment,
float  worst_piece_cert,
bool  fragmented,
float  best_choice_cert,
float  max_char_wh_ratio,
BLOB_CHOICE *  parent_b,
ViterbiStateEntry *  parent_vse,
CHUNKS_RECORD *  chunks_record,
HEAP *  pain_points 
)
void tesseract::LanguageModel::GeneratePainPointsFromBestChoice ( HEAP *  pain_points,
CHUNKS_RECORD *  chunks_record,
BestChoiceBundle *  best_choice_bundle 
)
void tesseract::LanguageModel::GeneratePainPointsFromColumn ( int  col,
const GenericVector< int > &  non_empty_rows,
float  best_choice_cert,
HEAP *  pain_points,
BestPathByColumn *  best_path_by_column[],
CHUNKS_RECORD *  chunks_record 
)
void tesseract::LanguageModel::GenerateProblematicPathPainPointsFromColumn ( int  col,
int  row,
float  best_choice_cert,
HEAP *  pain_points,
BestPathByColumn *  best_path_by_column[],
CHUNKS_RECORD *  chunks_record 
)
void tesseract::LanguageModel::GenerateTopChoiceInfo ( float  ratings_sum,
const LanguageModelDawgInfo *  dawg_info,
const LanguageModelConsistencyInfo &  consistency_info,
const ViterbiStateEntry *  parent_vse,
BLOB_CHOICE *  b,
LanguageModelFlagsType *  top_choice_flags,
LanguageModelFlagsType *  changed 
) [protected]
void tesseract::LanguageModel::GetPieceCertainty ( BLOB_CHOICE_LIST *  blist,
float *  cert,
bool *  fragmented 
) [inline, protected]
void tesseract::LanguageModel::GetTopChoiceLowerUpper ( LanguageModelFlagsType  changed,
BLOB_CHOICE_LIST *  curr_list,
BLOB_CHOICE **  first_lower,
BLOB_CHOICE **  first_upper 
) [protected]
void tesseract::LanguageModel::GetWorstPieceCertainty ( int  col,
int  row,
MATRIX *  ratings,
float *  cert,
bool *  fragmented 
) [inline]
void tesseract::LanguageModel::InitForWord ( const WERD_CHOICE *  prev_word,
bool  fixed_pitch,
float  best_choice_cert,
float  max_char_wh_ratio,
float  rating_cert_scale,
HEAP *  pain_points,
CHUNKS_RECORD *  chunks_record,
BlamerBundle *  blamer_bundle,
bool  debug_blamer 
)
bool tesseract::LanguageModel::IsFragment ( BLOB_CHOICE *  b  )  [inline, protected]
bool tesseract::LanguageModel::IsHan ( int  script_id  )  [inline, protected]
bool tesseract::LanguageModel::NonAlphaOrDigitMiddle ( int  col,
int  row,
int  dimension,
UNICHAR_ID  unichar_id 
) [inline, protected]
void tesseract::LanguageModel::PrintViterbiStateEntry ( const char *  msg,
ViterbiStateEntry *  vse,
BLOB_CHOICE *  b,
CHUNKS_RECORD *  chunks_record 
) [protected]
bool tesseract::LanguageModel::ProblematicPath ( const ViterbiStateEntry &  vse,
UNICHAR_ID  unichar_id,
bool  word_end 
) [protected]
bool tesseract::LanguageModel::PrunablePath ( LanguageModelFlagsType  top_choice_flags,
const LanguageModelDawgInfo *  dawg_info 
) [inline, protected]
void tesseract::LanguageModel::UpdateBestChoice ( BLOB_CHOICE *  b,
ViterbiStateEntry *  vse,
HEAP *  pain_points,
CHUNKS_RECORD *  chunks_record,
BestChoiceBundle *  best_choice_bundle,
BlamerBundle *  blamer_bundle 
) [protected]
void tesseract::LanguageModel::UpdateCoveredByFixedLengthDawgs ( const DawgInfoVector &  active_dawgs,
int  word_index,
int  word_length,
int *  skip,
int *  covered,
float *  dawg_score,
bool *  dawg_score_done 
) [protected]
LanguageModelFlagsType tesseract::LanguageModel::UpdateState ( LanguageModelFlagsType  changed,
int  curr_col,
int  curr_row,
BLOB_CHOICE_LIST *  curr_list,
BLOB_CHOICE_LIST *  parent_list,
HEAP *  pain_points,
BestPathByColumn *  best_path_by_column[],
CHUNKS_RECORD *  chunks_record,
BestChoiceBundle *  best_choice_bundle,
BlamerBundle *  blamer_bundle 
)

Member Data Documentation

const float tesseract::LanguageModel::kMaxAvgNgramCost = 25.0f [static]

"Language model debug level"

"Depth of blob choice lists to explore" " when fixed length dawgs are on"

"Minimum length of compound words"

"Average classifier score of a non-matching unichar"

"Turn on/off the use of character ngram model"

"Maximum order of the character ngram model"

"Strength of the character ngram model relative to the" " character classifier "

"To avoid overly small denominators use this as the floor" " of the probability returned by the ngram model"

"Words are delimited by space"

"Use only the first UTF8 step of the given string" " when computing log probabilities"

"Penalty for inconsistent case"

"Penalty for inconsistent character type"

"Penalty for inconsistent font"

"Penalty increment"

"Penalty for non-dictionary words"

"Penalty for words not in the frequent word dictionary"

"Penalty for inconsistent punctuation"

"Penalty for inconsistent script"

"Penalty for inconsistent spacing"

"Use sigmoidal score for certainty"

"Maximum number of prunable (those for which PrunablePath() is true)" "entries in each viterbi list recorded in BLOB_CHOICEs"

"Maximum size of viterbi lists recorded in BLOB_CHOICEs"


The documentation for this class was generated from the following files:
Generated on Thu Feb 2 08:19:27 2012 for Tesseract by  doxygen 1.6.3