tesseract::Classify Class Reference

#include <classify.h>

Inheritance diagram for tesseract::Classify:
tesseract::CCStruct tesseract::CUtil tesseract::CCUtil tesseract::Wordrec tesseract::Tesseract

List of all members.

Public Member Functions

 Classify ()
virtual ~Classify ()
Dict & getDict ()
const ShapeTable * shape_table () const
ADAPT_TEMPLATES NewAdaptedTemplates (bool InitFromUnicharset)
int GetFontinfoId (ADAPT_CLASS Class, uinT8 ConfigId)
int PruneClasses (const INT_TEMPLATES_STRUCT *int_templates, int num_features, const INT_FEATURE_STRUCT *features, const uinT8 *normalization_factors, const uinT16 *expected_num_features, CP_RESULT_STRUCT *results)
void ReadNewCutoffs (FILE *CutoffFile, bool swap, inT64 end_offset, CLASS_CUTOFF_ARRAY Cutoffs)
void PrintAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)
void WriteAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)
ADAPT_TEMPLATES ReadAdaptedTemplates (FILE *File)
FLOAT32 ComputeNormMatch (CLASS_ID ClassId, const FEATURE_STRUCT &feature, BOOL8 DebugMatch)
void FreeNormProtos ()
NORM_PROTOS * ReadNormProtos (FILE *File, inT64 end_offset)
void ConvertProto (PROTO Proto, int ProtoId, INT_CLASS Class)
INT_TEMPLATES CreateIntTemplates (CLASSES FloatProtos, const UNICHARSET &target_unicharset)
void LearnWord (const char *filename, const char *rejmap, WERD_RES *word)
void LearnPieces (const char *filename, int start, int length, float threshold, CharSegmentationType segmentation, const char *correct_text, WERD_RES *word)
void InitAdaptiveClassifier (bool load_pre_trained_templates)
void InitAdaptedClass (TBLOB *Blob, const DENORM &denorm, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS Class, ADAPT_TEMPLATES Templates)
void AdaptToPunc (TBLOB *Blob, const DENORM &denorm, CLASS_ID ClassId, int FontinfoId, FLOAT32 Threshold)
void AmbigClassifier (TBLOB *Blob, const DENORM &denorm, INT_TEMPLATES Templates, ADAPT_CLASS *Classes, UNICHAR_ID *Ambiguities, ADAPT_RESULTS *Results)
void MasterMatcher (INT_TEMPLATES templates, inT16 num_features, const INT_FEATURE_STRUCT *features, const uinT8 *norm_factors, ADAPT_CLASS *classes, int debug, int num_classes, const TBOX &blob_box, CLASS_PRUNER_RESULTS results, ADAPT_RESULTS *final_results)
void ExpandShapesAndApplyCorrections (ADAPT_CLASS *classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, const uinT8 *cn_factors, INT_RESULT_STRUCT &int_result, ADAPT_RESULTS *final_results)
double ComputeCorrectedRating (bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, int bottom, int top, int blob_length, const uinT8 *cn_factors)
void ConvertMatchesToChoices (const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices)
void AddNewResult (ADAPT_RESULTS *results, CLASS_ID class_id, int shape_id, FLOAT32 rating, bool adapted, int config, int fontinfo_id, int fontinfo_id2)
int GetAdaptiveFeatures (TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures)
void DebugAdaptiveClassifier (TBLOB *Blob, const DENORM &denorm, ADAPT_RESULTS *Results)
void GetAdaptThresholds (TWERD *Word, const DENORM &denorm, const WERD_CHOICE &BestChoice, const WERD_CHOICE &BestRawChoice, FLOAT32 Thresholds[])
PROTO_ID MakeNewTempProtos (FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], INT_CLASS IClass, ADAPT_CLASS Class, BIT_VECTOR TempProtoMask)
int MakeNewTemporaryConfig (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int FontinfoId, int NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_SET FloatFeatures)
void MakePermanent (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int ConfigId, const DENORM &denorm, TBLOB *Blob)
void PrintAdaptiveMatchResults (FILE *File, ADAPT_RESULTS *Results)
void RemoveExtraPuncs (ADAPT_RESULTS *Results)
void RemoveBadMatches (ADAPT_RESULTS *Results)
void SetAdaptiveThreshold (FLOAT32 Threshold)
void ShowBestMatchFor (TBLOB *Blob, const DENORM &denorm, CLASS_ID ClassId, int shape_id, BOOL8 AdaptiveOn, BOOL8 PreTrainedOn, ADAPT_RESULTS *Results)
STRING ClassIDToDebugStr (const INT_TEMPLATES_STRUCT *templates, int class_id, int config_id) const
int ClassAndConfigIDToFontOrShapeID (int class_id, int int_result_config) const
int ShapeIDToClassID (int shape_id) const
UNICHAR_ID * BaselineClassifier (TBLOB *Blob, const DENORM &denorm, ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results)
int CharNormClassifier (TBLOB *Blob, const DENORM &denorm, INT_TEMPLATES Templates, ADAPT_RESULTS *Results)
int CharNormTrainingSample (bool pruner_only, const TrainingSample &sample, GenericVector< ShapeRating > *results)
UNICHAR_ID * GetAmbiguities (TBLOB *Blob, const DENORM &denorm, CLASS_ID CorrectClass)
void DoAdaptiveMatch (TBLOB *Blob, const DENORM &denorm, ADAPT_RESULTS *Results)
void AdaptToChar (TBLOB *Blob, const DENORM &denorm, CLASS_ID ClassId, int FontinfoId, FLOAT32 Threshold)
void DisplayAdaptedChar (TBLOB *blob, const DENORM &denorm, INT_CLASS_STRUCT *int_class)
int AdaptableWord (TWERD *Word, const WERD_CHOICE &BestChoiceWord, const WERD_CHOICE &RawChoiceWord)
void EndAdaptiveClassifier ()
void PrintAdaptiveStatistics (FILE *File)
void SettupPass1 ()
void SettupPass2 ()
void AdaptiveClassifier (TBLOB *Blob, const DENORM &denorm, BLOB_CHOICE_LIST *Choices, CLASS_PRUNER_RESULTS cp_results)
void ClassifyAsNoise (ADAPT_RESULTS *Results)
void ResetAdaptiveClassifierInternal ()
int GetBaselineFeatures (TBLOB *Blob, const DENORM &denorm, INT_TEMPLATES Templates, INT_FEATURE_ARRAY IntFeatures, uinT8 *CharNormArray, inT32 *BlobLength)
int GetCharNormFeatures (TBLOB *Blob, const DENORM &denorm, INT_TEMPLATES Templates, INT_FEATURE_ARRAY IntFeatures, uinT8 *PrunerNormArray, uinT8 *CharNormArray, inT32 *BlobLength, inT32 *FeatureOutlineIndex)
void ComputeCharNormArrays (FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uinT8 *char_norm_array, uinT8 *pruner_array)
bool TempConfigReliable (CLASS_ID class_id, const TEMP_CONFIG &config)
void UpdateAmbigsGroup (CLASS_ID class_id, const DENORM &denorm, TBLOB *Blob)
void ResetFeaturesHaveBeenExtracted ()
bool AdaptiveClassifierIsFull ()
bool LooksLikeGarbage (const DENORM &denorm, TBLOB *blob)
void RefreshDebugWindow (ScrollView **win, const char *msg, int y_offset, const TBOX &wbox)
void ClearCharNormArray (uinT8 *char_norm_array)
void ComputeIntCharNormArray (const FEATURE_STRUCT &norm_feature, uinT8 *char_norm_array)
void ComputeIntFeatures (FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures)
INT_TEMPLATES ReadIntTemplates (FILE *File)
void WriteIntTemplates (FILE *File, INT_TEMPLATES Templates, const UNICHARSET &target_unicharset)
CLASS_ID GetClassToDebug (const char *Prompt, bool *adaptive_on, bool *pretrained_on, int *shape_id)
void ShowMatchDisplay ()
UnicityTable< FontInfo > & get_fontinfo_table ()
UnicityTable< FontSet > & get_fontset_table ()
void NormalizeOutlines (LIST Outlines, FLOAT32 *XScale, FLOAT32 *YScale)
FEATURE_SET ExtractOutlineFeatures (TBLOB *Blob)
FEATURE_SET ExtractPicoFeatures (TBLOB *Blob)
ReadClassFile

Read in the training data from a file. All of the classes are read in. The results are stored in the global variable, 'TrainingData'.

void ReadClassFile ()

Public Attributes

bool prioritize_division = FALSE
int tessedit_single_match = FALSE
bool classify_enable_learning = true
int classify_debug_level = 0
int classify_norm_method = character
double classify_char_norm_range = 0.2
double classify_min_norm_scale_x = 0.0
double classify_max_norm_scale_x = 0.325
double classify_min_norm_scale_y = 0.0
double classify_max_norm_scale_y = 0.325
bool tess_cn_matching = 0
bool tess_bn_matching = 0
bool classify_enable_adaptive_matcher = 1
bool classify_use_pre_adapted_templates = 0
bool classify_save_adapted_templates = 0
bool classify_enable_adaptive_debugger = 0
int matcher_debug_level = 0
int matcher_debug_flags = 0
int classify_learning_debug_level = 0
double matcher_good_threshold = 0.125
double matcher_great_threshold = 0.0
double matcher_perfect_threshold = 0.02
double matcher_bad_match_pad = 0.15
double matcher_rating_margin = 0.1
double matcher_avg_noise_size = 12.0
int matcher_permanent_classes_min = 1
int matcher_min_examples_for_prototyping = 3
int matcher_sufficient_examples_for_prototyping = 5
double matcher_clustering_max_angle_delta = 0.015
double classify_misfit_junk_penalty = 0.0
double rating_scale = 1.5
double certainty_scale = 20.0
double tessedit_class_miss_scale = 0.00390625
int classify_adapt_proto_threshold = 230
int classify_adapt_feature_threshold = 230
bool disable_character_fragments = TRUE
double classify_character_fragments_garbage_certainty_threshold = -3.0
bool classify_debug_character_fragments = FALSE
bool matcher_debug_separate_windows = FALSE
char * classify_learn_debug_str = ""
int classify_class_pruner_threshold = 229
int classify_class_pruner_multiplier = 30
int classify_cp_cutoff_strength = 7
int classify_integer_matcher_multiplier = 14
INT_TEMPLATES PreTrainedTemplates
ADAPT_TEMPLATES AdaptedTemplates
BIT_VECTOR AllProtosOn
BIT_VECTOR PrunedProtos
BIT_VECTOR AllConfigsOn
BIT_VECTOR AllProtosOff
BIT_VECTOR AllConfigsOff
BIT_VECTOR TempProtoMask
bool EnableLearning
NORM_PROTOS * NormProtos
UnicityTable< FontInfo > fontinfo_table_
UnicityTable< FontSet > fontset_table_
int il1_adaption_test = 0
bool classify_bln_numeric_mode = 0

Protected Attributes

IntegerMatcher im_
FEATURE_DEFS_STRUCT feature_defs_
ShapeTable * shape_table_

Constructor & Destructor Documentation

tesseract::Classify::Classify (  ) 
tesseract::Classify::~Classify (  )  [virtual]

Member Function Documentation

int tesseract::Classify::AdaptableWord ( TWERD Word,
const WERD_CHOICE BestChoiceWord,
const WERD_CHOICE RawChoiceWord 
)

Return TRUE if the specified word is acceptable for adaptation.

Globals: none

Parameters:
Word current word
BestChoiceWord best overall choice for word with context
RawChoiceWord best choice for word without context
Returns:
TRUE or FALSE
Note:
Exceptions: none
History: Thu May 30 14:25:06 1991, DSJ, Created.
void tesseract::Classify::AdaptiveClassifier ( TBLOB Blob,
const DENORM denorm,
BLOB_CHOICE_LIST *  Choices,
CLASS_PRUNER_RESULTS  CPResults 
)

This routine calls the adaptive matcher which returns (in an array) the class id of each class matched.

It also returns the number of classes matched. For each class matched it places the best rating found for that class into the Ratings array.

Bad matches are then removed so that they don't need to be sorted. The remaining good matches are then sorted and converted to choices.

This routine also performs some simple speckle filtering.

Note:
Exceptions: none
History: Mon Mar 11 10:00:58 1991, DSJ, Created.
Parameters:
Blob blob to be classified
[out] Choices List of choices found by adaptive matcher.
[out] CPResults Array of CPResultStruct of size MAX_NUM_CLASSES is filled on return with the choices found by the class pruner and the ratings therefrom. Also contains the detailed results of the integer matcher.
bool tesseract::Classify::AdaptiveClassifierIsFull (  )  [inline]
void tesseract::Classify::AdaptToChar ( TBLOB Blob,
const DENORM denorm,
CLASS_ID  ClassId,
int  FontinfoId,
FLOAT32  Threshold 
)
Parameters:
Blob blob to add to templates for ClassId
LineStats statistics about text line blob is in
ClassId class to add blob to
FontinfoId font information from pre-trained templates
Threshold minimum match rating to existing template

Globals:

  • AdaptedTemplates current set of adapted templates
  • AllProtosOn dummy mask to match against all protos
  • AllConfigsOn dummy mask to match against all configs
Returns:
none
Note:
Exceptions: none
History: Thu Mar 14 09:36:03 1991, DSJ, Created.
void tesseract::Classify::AdaptToPunc ( TBLOB Blob,
const DENORM denorm,
CLASS_ID  ClassId,
int  FontinfoId,
FLOAT32  Threshold 
)
Parameters:
Blob blob to add to templates for ClassId
LineStats statistics about text line blob is in
ClassId class to add blob to
FontinfoId font information from pre-trained templates
Threshold minimum match rating to existing template

Globals:

  • PreTrainedTemplates current set of built-in templates
Note:
Exceptions: none
History: Thu Mar 14 09:36:03 1991, DSJ, Created.
void tesseract::Classify::AddNewResult ( ADAPT_RESULTS results,
CLASS_ID  class_id,
int  shape_id,
FLOAT32  rating,
bool  adapted,
int  config,
int  fontinfo_id,
int  fontinfo_id2 
)

This routine adds the result of a classification into Results. If the new rating is much worse than the current best rating, it is not entered into results because it would end up being stripped later anyway. If the new rating is better than the old rating for the class, it replaces the old rating. If this is the first rating for the class, the class is added to the list of matched classes in Results. If the new rating is better than the best so far, it becomes the best so far.

Globals:

Parameters:
[out] results results to add new result to
class_id class of new result
rating rating of new result
config config id of new result
config2 config id of 2nd choice result
fontinfo_id font information of the new result
fontinfo_id2 font information of the 2nd choice result
Note:
Exceptions: none
History: Tue Mar 12 18:19:29 1991, DSJ, Created.
void tesseract::Classify::AmbigClassifier ( TBLOB Blob,
const DENORM denorm,
INT_TEMPLATES  Templates,
ADAPT_CLASS Classes,
UNICHAR_ID Ambiguities,
ADAPT_RESULTS Results 
)

This routine is identical to CharNormClassifier() except that it does no class pruning. It simply matches the unknown blob against the classes listed in Ambiguities.

Globals:

Parameters:
Blob blob to be classified
Templates built-in templates to classify against
Ambiguities array of class id's to match against
[out] Results place to put match results
Note:
Exceptions: none
History: Tue Mar 12 19:40:36 1991, DSJ, Created.
UNICHAR_ID * tesseract::Classify::BaselineClassifier ( TBLOB Blob,
const DENORM denorm,
ADAPT_TEMPLATES  Templates,
ADAPT_RESULTS Results 
)

This routine extracts baseline normalized features from the unknown character and matches them against the specified set of templates. The classes which match are added to Results.

Globals:

  • BaselineCutoffs expected num features for each class
Parameters:
Blob blob to be classified
Templates current set of adapted templates
Results place to put match results
Returns:
Array of possible ambiguous chars that should be checked.
Note:
Exceptions: none
History: Tue Mar 12 19:38:03 1991, DSJ, Created.
int tesseract::Classify::CharNormClassifier ( TBLOB Blob,
const DENORM denorm,
INT_TEMPLATES  Templates,
ADAPT_RESULTS Results 
)

This routine extracts character normalized features from the unknown character and matches them against the specified set of templates. The classes which match are added to Results.

Parameters:
Blob blob to be classified
Templates templates to classify unknown against
Results place to put match results

Globals:

  • CharNormCutoffs expected num features for each class
  • AllProtosOn mask that enables all protos
  • AllConfigsOn mask that enables all configs
Note:
Exceptions: none
History: Tue Mar 12 16:02:52 1991, DSJ, Created.
int tesseract::Classify::CharNormTrainingSample ( bool  pruner_only,
const TrainingSample sample,
GenericVector< ShapeRating > *  results 
)
int tesseract::Classify::ClassAndConfigIDToFontOrShapeID ( int  class_id,
int  int_result_config 
) const
STRING tesseract::Classify::ClassIDToDebugStr ( const INT_TEMPLATES_STRUCT templates,
int  class_id,
int  config_id 
) const
void tesseract::Classify::ClassifyAsNoise ( ADAPT_RESULTS Results  ) 

This routine computes a rating which reflects the likelihood that the blob being classified is a noise blob. NOTE: assumes that the blob length has already been computed and placed into Results.

Parameters:
Results results to add noise classification to

Globals:

  • matcher_avg_noise_size avg. length of a noise blob
Note:
Exceptions: none
History: Tue Mar 12 18:36:52 1991, DSJ, Created.
void tesseract::Classify::ClearCharNormArray ( uinT8 char_norm_array  ) 

For each class in the unicharset, clears the corresponding entry in char_norm_array. char_norm_array is indexed by unichar_id.

Globals:

  • none
Parameters:
char_norm_array array to be cleared
Note:
Exceptions: none
History: Wed Feb 20 11:20:54 1991, DSJ, Created.
void tesseract::Classify::ComputeCharNormArrays ( FEATURE_STRUCT norm_feature,
INT_TEMPLATES_STRUCT templates,
uinT8 char_norm_array,
uinT8 pruner_array 
)
double tesseract::Classify::ComputeCorrectedRating ( bool  debug,
int  unichar_id,
double  cp_rating,
double  im_rating,
int  feature_misses,
int  bottom,
int  top,
int  blob_length,
const uinT8 cn_factors 
)
void tesseract::Classify::ComputeIntCharNormArray ( const FEATURE_STRUCT norm_feature,
uinT8 char_norm_array 
)

For each class in unicharset, computes the match between norm_feature and the normalization protos for that class. Converts this number to the range from 0 - 255 and stores it into char_norm_array. CharNormArray is indexed by unichar_id.

Globals:

  • none
Parameters:
norm_feature character normalization feature
[out] char_norm_array place to put results of size unicharset.size()
Note:
Exceptions: none
History: Wed Feb 20 11:20:54 1991, DSJ, Created.
void tesseract::Classify::ComputeIntFeatures ( FEATURE_SET  Features,
INT_FEATURE_ARRAY  IntFeatures 
)

This routine converts each floating point pico-feature in Features into integer format and saves it into IntFeatures.

Globals:

  • none
Parameters:
Features floating point pico-features to be converted
[out] IntFeatures array to put converted features into
Note:
Exceptions: none
History: Wed Feb 20 10:58:45 1991, DSJ, Created.
FLOAT32 tesseract::Classify::ComputeNormMatch ( CLASS_ID  ClassId,
const FEATURE_STRUCT feature,
BOOL8  DebugMatch 
)
void tesseract::Classify::ConvertMatchesToChoices ( const DENORM denorm,
const TBOX box,
ADAPT_RESULTS Results,
BLOB_CHOICE_LIST *  Choices 
)

The function converts the given match ratings to the list of blob choices with ratings and certainties (used by the context checkers). If character fragments are present in the results, this function also makes sure that there is at least one non-fragmented classification included. For each classification result check the unicharset for "definite" ambiguities and modify the resulting Choices accordingly.

void tesseract::Classify::ConvertProto ( PROTO  Proto,
int  ProtoId,
INT_CLASS  Class 
)
INT_TEMPLATES tesseract::Classify::CreateIntTemplates ( CLASSES  FloatProtos,
const UNICHARSET target_unicharset 
)
void tesseract::Classify::DebugAdaptiveClassifier ( TBLOB Blob,
const DENORM denorm,
ADAPT_RESULTS Results 
)
Parameters:
Blob blob whose classification is being debugged
Results results of match being debugged

Globals: none

Note:
Exceptions: none
History: Wed Mar 13 16:44:41 1991, DSJ, Created.
void tesseract::Classify::DisplayAdaptedChar ( TBLOB blob,
const DENORM denorm,
INT_CLASS_STRUCT int_class 
)
void tesseract::Classify::DoAdaptiveMatch ( TBLOB Blob,
const DENORM denorm,
ADAPT_RESULTS Results 
)

This routine performs an adaptive classification. If we have not yet adapted to enough classes, a simple classification to the pre-trained templates is performed. Otherwise, we match the blob against the adapted templates. If the adapted templates do not match well, we try a match against the pre-trained templates. If an adapted template match is found, we do a match to any pre-trained templates which could be ambiguous. The results from all of these classifications are merged together into Results.

Parameters:
Blob blob to be classified
Results place to put match results

Globals:

  • PreTrainedTemplates built-in training templates
  • AdaptedTemplates templates adapted for this page
  • matcher_great_threshold rating limit for a great match
Note:
Exceptions: none
History: Tue Mar 12 08:50:11 1991, DSJ, Created.
void tesseract::Classify::EndAdaptiveClassifier (  ) 

This routine performs cleanup operations on the adaptive classifier. It should be called before the program is terminated. Its main function is to save the adapted templates to a file.

Globals:

Note:
Exceptions: none
History: Tue Mar 19 14:37:06 1991, DSJ, Created.
void tesseract::Classify::ExpandShapesAndApplyCorrections ( ADAPT_CLASS classes,
bool  debug,
int  class_id,
int  bottom,
int  top,
float  cp_rating,
int  blob_length,
const uinT8 cn_factors,
INT_RESULT_STRUCT int_result,
ADAPT_RESULTS final_results 
)
FEATURE_SET tesseract::Classify::ExtractOutlineFeatures ( TBLOB Blob  ) 
FEATURE_SET tesseract::Classify::ExtractPicoFeatures ( TBLOB Blob  ) 
void tesseract::Classify::FreeNormProtos (  ) 
UnicityTable<FontInfo>& tesseract::Classify::get_fontinfo_table (  )  [inline]
UnicityTable<FontSet>& tesseract::Classify::get_fontset_table (  )  [inline]
int tesseract::Classify::GetAdaptiveFeatures ( TBLOB Blob,
INT_FEATURE_ARRAY  IntFeatures,
FEATURE_SET FloatFeatures 
)

This routine sets up the feature extractor to extract baseline normalized pico-features.

The extracted pico-features are converted to integer form and placed in IntFeatures. The original floating-pt. features are returned in FloatFeatures.

Globals: none

Parameters:
Blob blob to extract features from
LineStats statistics about text row blob is in
[out] IntFeatures array to fill with integer features
[out] FloatFeatures place to return actual floating-pt features
Returns:
Number of pico-features returned (0 if an error occurred)
Note:
Exceptions: none
History: Tue Mar 12 17:55:18 1991, DSJ, Created.
void tesseract::Classify::GetAdaptThresholds ( TWERD Word,
const DENORM denorm,
const WERD_CHOICE BestChoice,
const WERD_CHOICE BestRawChoice,
FLOAT32  Thresholds[] 
)

This routine tries to estimate how tight the adaptation threshold should be set for each character in the current word. In general, the routine tries to set tighter thresholds for a character when the current set of templates would have made an error on that character. It tries to set a threshold tight enough to eliminate the error. Two different sets of rules can be used to determine the desired thresholds.

Parameters:
Word current word
BestChoice best choice for current word with context
BestRawChoice best choice for current word without context
[out] Thresholds array of thresholds to be filled in

Globals:

  • matcher_good_threshold
  • matcher_perfect_threshold
  • matcher_rating_margin
Returns:
none (results are returned in Thresholds)
Note:
Exceptions: none
History: Fri May 31 09:22:08 1991, DSJ, Created.
UNICHAR_ID * tesseract::Classify::GetAmbiguities ( TBLOB Blob,
const DENORM denorm,
CLASS_ID  CorrectClass 
)

This routine matches blob to the built-in templates to find out if there are any classes other than the correct class which are potential ambiguities.

Parameters:
Blob blob to get classification ambiguities for
CorrectClass correct class for Blob

Globals:

  • CurrentRatings used by qsort compare routine
  • PreTrainedTemplates built-in templates
Returns:
String containing all possible ambiguous classes.
Note:
Exceptions: none
History: Fri Mar 15 08:08:22 1991, DSJ, Created.
int tesseract::Classify::GetBaselineFeatures ( TBLOB Blob,
const DENORM denorm,
INT_TEMPLATES  Templates,
INT_FEATURE_ARRAY  IntFeatures,
uinT8 CharNormArray,
inT32 BlobLength 
)

This routine calls the integer (Hardware) feature extractor if it has not been called before for this blob. The results from the feature extractor are placed into globals so that they can be used in other routines without re-extracting the features. It then copies the baseline features into the IntFeatures array provided by the caller.

Parameters:
Blob blob to extract features from
Templates used to compute char norm adjustments
IntFeatures array to fill with integer features
CharNormArray array to fill with dummy char norm adjustments
BlobLength length of blob in baseline-normalized units

Globals:

  • FeaturesHaveBeenExtracted TRUE if fx has been done
  • BaselineFeatures holds extracted baseline feat
  • CharNormFeatures holds extracted char norm feat
  • FXInfo holds misc. FX info
Returns:
Number of features extracted or 0 if an error occurred.
Note:
Exceptions: none
History: Tue May 28 10:40:52 1991, DSJ, Created.
int tesseract::Classify::GetCharNormFeatures ( TBLOB Blob,
const DENORM denorm,
INT_TEMPLATES  Templates,
INT_FEATURE_ARRAY  IntFeatures,
uinT8 PrunerNormArray,
uinT8 CharNormArray,
inT32 BlobLength,
inT32 FeatureOutlineArray 
)

This routine calls the integer (Hardware) feature extractor if it has not been called before for this blob.

The results from the feature extractor are placed into globals so that they can be used in other routines without re-extracting the features.

It then copies the char norm features into the IntFeatures array provided by the caller.

Parameters:
Blob blob to extract features from
Templates used to compute char norm adjustments
IntFeatures array to fill with integer features
CharNormArray array to fill with dummy char norm adjustments
BlobLength length of blob in baseline-normalized units

Globals:

  • FeaturesHaveBeenExtracted TRUE if fx has been done
  • BaselineFeatures holds extracted baseline feat
  • CharNormFeatures holds extracted char norm feat
  • FXInfo holds misc. FX info
Returns:
Number of features extracted or 0 if an error occurred.
Note:
Exceptions: none
History: Tue May 28 10:40:52 1991, DSJ, Created.
CLASS_ID tesseract::Classify::GetClassToDebug ( const char *  Prompt,
bool *  adaptive_on,
bool *  pretrained_on,
int *  shape_id 
)
Dict& tesseract::Classify::getDict (  )  [inline]
int tesseract::Classify::GetFontinfoId ( ADAPT_CLASS  Class,
uinT8  ConfigId 
)
void tesseract::Classify::InitAdaptedClass ( TBLOB Blob,
const DENORM denorm,
CLASS_ID  ClassId,
int  FontinfoId,
ADAPT_CLASS  Class,
ADAPT_TEMPLATES  Templates 
)

This routine creates a new adapted class and uses Blob as the model for the first config in that class.

Parameters:
Blob blob to model new class after
ClassId id of the class to be initialized
FontinfoId font information inferred from pre-trained templates
Class adapted class to be initialized
Templates adapted templates to add new class to

Globals:

Note:
Exceptions: none
History: Thu Mar 14 12:49:39 1991, DSJ, Created.
void tesseract::Classify::InitAdaptiveClassifier ( bool  load_pre_trained_templates  ) 

This routine reads in the training information needed by the adaptive classifier and saves it into global variables.

Parameters:
load_pre_trained_templates Indicates whether the pre-trained templates (inttemp, normproto and pffmtable components) should be loaded. Should only be set to true if the necessary classifier components are present in the [lang].traineddata file.

Globals:

  • BuiltInTemplatesFile file to get built-in temps from
  • BuiltInCutoffsFile file to get avg. feat per class from
  • classify_use_pre_adapted_templates enables use of pre-adapted templates

Note:
History: Mon Mar 11 12:49:34 1991, DSJ, Created.
void tesseract::Classify::LearnPieces ( const char *  filename,
int  start,
int  length,
float  threshold,
CharSegmentationType  segmentation,
const char *  correct_text,
WERD_RES word 
)
void tesseract::Classify::LearnWord ( const char *  filename,
const char *  rejmap,
WERD_RES word 
)
bool tesseract::Classify::LooksLikeGarbage ( const DENORM denorm,
TBLOB blob 
)
int tesseract::Classify::MakeNewTemporaryConfig ( ADAPT_TEMPLATES  Templates,
CLASS_ID  ClassId,
int  FontinfoId,
int  NumFeatures,
INT_FEATURE_ARRAY  Features,
FEATURE_SET  FloatFeatures 
)
Parameters:
Templates adapted templates to add new config to
ClassId class id to associate with new config
FontinfoId font information inferred from pre-trained templates
NumFeatures number of features in IntFeatures
Features features describing model for new config
FloatFeatures floating-pt representation of features
Returns:
The id of the new config created, a negative integer in case of error.
Note:
Exceptions: none
History: Fri Mar 15 08:49:46 1991, DSJ, Created.
PROTO_ID tesseract::Classify::MakeNewTempProtos ( FEATURE_SET  Features,
int  NumBadFeat,
FEATURE_ID  BadFeat[],
INT_CLASS  IClass,
ADAPT_CLASS  Class,
BIT_VECTOR  TempProtoMask 
)

This routine finds sets of sequential bad features that all have the same angle and converts each set into a new temporary proto. The temp proto is added to the proto pruner for IClass, pushed onto the list of temp protos in Class, and added to TempProtoMask.

Parameters:
Features floating-pt features describing new character
NumBadFeat number of bad features to turn into protos
BadFeat feature id's of bad features
IClass integer class templates to add new protos to
Class adapted class templates to add new protos to
TempProtoMask proto mask to add new protos to

Globals: none

Returns:
Max proto id in class after all protos have been added.
Note:
Exceptions: none
History: Fri Mar 15 11:39:38 1991, DSJ, Created.
void tesseract::Classify::MakePermanent ( ADAPT_TEMPLATES  Templates,
CLASS_ID  ClassId,
int  ConfigId,
const DENORM denorm,
TBLOB Blob 
)
Parameters:
Templates current set of adaptive templates
ClassId class containing config to be made permanent
ConfigId config to be made permanent
Blob current blob being adapted to

Globals: none

Note:
Exceptions: none
History: Thu Mar 14 15:54:08 1991, DSJ, Created.
void tesseract::Classify::MasterMatcher ( INT_TEMPLATES  templates,
inT16  num_features,
const INT_FEATURE_STRUCT features,
const uinT8 norm_factors,
ADAPT_CLASS classes,
int  debug,
int  num_classes,
const TBOX blob_box,
CLASS_PRUNER_RESULTS  results,
ADAPT_RESULTS final_results 
)

Factored-out calls to IntegerMatcher based on class pruner results. Returns integer matcher results inside CLASS_PRUNER_RESULTS structure.

ADAPT_TEMPLATES tesseract::Classify::NewAdaptedTemplates ( bool  InitFromUnicharset  ) 

Allocates memory for adapted templates.

Parameters:
InitFromUnicharset if true, add an empty class for each char in unicharset to the newly created templates
Returns:
Ptr to new adapted templates.
Note:
Globals: none
Exceptions: none
History: Fri Mar 8 10:15:28 1991, DSJ, Created.
void tesseract::Classify::NormalizeOutlines ( LIST  Outlines,
FLOAT32 XScale,
FLOAT32 YScale 
)
void tesseract::Classify::PrintAdaptedTemplates ( FILE *  File,
ADAPT_TEMPLATES  Templates 
)

This routine prints a summary of the adapted templates in Templates to File.

Parameters:
File open text file to print Templates to
Templates adapted templates to print to File
Note:
Globals: none
Exceptions: none
History: Wed Mar 20 13:35:29 1991, DSJ, Created.
void tesseract::Classify::PrintAdaptiveMatchResults ( FILE *  File,
ADAPT_RESULTS Results 
)

This routine writes the matches in Results to File.

Parameters:
File open text file to write Results to
Results match results to write to File

Globals: none

Note:
Exceptions: none
History: Mon Mar 18 09:24:53 1991, DSJ, Created.
void tesseract::Classify::PrintAdaptiveStatistics ( FILE *  File  ) 

Print to File the statistics which have been gathered for the adaptive matcher.

Parameters:
File open text file to print adaptive statistics to

Globals: none

Note:
Exceptions: none
History: Thu Apr 18 14:37:37 1991, DSJ, Created.
int tesseract::Classify::PruneClasses ( const INT_TEMPLATES_STRUCT int_templates,
int  num_features,
const INT_FEATURE_STRUCT features,
const uinT8 normalization_factors,
const uinT16 expected_num_features,
CP_RESULT_STRUCT results 
)
ADAPT_TEMPLATES tesseract::Classify::ReadAdaptedTemplates ( FILE *  File  ) 

Read a set of adapted templates from File and return a ptr to the templates.

Parameters:
File open text file to read adapted templates from
Returns:
Ptr to adapted templates read from File.
Note:
Globals: none
Exceptions: none
History: Mon Mar 18 15:18:10 1991, DSJ, Created.
void tesseract::Classify::ReadClassFile (  ) 
INT_TEMPLATES tesseract::Classify::ReadIntTemplates ( FILE *  File  ) 
void tesseract::Classify::ReadNewCutoffs ( FILE *  CutoffFile,
bool  swap,
inT64  end_offset,
CLASS_CUTOFF_ARRAY  Cutoffs 
)
NORM_PROTOS * tesseract::Classify::ReadNormProtos ( FILE *  File,
inT64  end_offset 
)
void tesseract::Classify::RefreshDebugWindow ( ScrollView **  win,
const char *  msg,
int  y_offset,
const TBOX wbox 
)
void tesseract::Classify::RemoveBadMatches ( ADAPT_RESULTS Results  ) 

This routine steps through each matching class in Results and removes it from the match list if its rating is worse than the BestRating plus a pad. In other words, all good matches get moved to the front of the classes array.

Parameters:
Results contains matches to be filtered

Globals:

  • matcher_bad_match_pad defines a "bad match"
Note:
Exceptions: none
History: Tue Mar 12 13:51:03 1991, DSJ, Created.
void tesseract::Classify::RemoveExtraPuncs ( ADAPT_RESULTS Results  ) 

This routine steps through each matching class in Results and removes it from the match list if its rating is worse than the BestRating plus a pad. In other words, all good matches get moved to the front of the classes array.

Parameters:
Results contains matches to be filtered

Globals:

  • matcher_bad_match_pad defines a "bad match"
Note:
Exceptions: none
History: Tue Mar 12 13:51:03 1991, DSJ, Created.
void tesseract::Classify::ResetAdaptiveClassifierInternal (  ) 
void tesseract::Classify::ResetFeaturesHaveBeenExtracted (  ) 
void tesseract::Classify::SetAdaptiveThreshold ( FLOAT32  Threshold  ) 

This routine resets the internal thresholds inside the integer matcher to correspond to the specified threshold.

Parameters:
Threshold threshold for creating new templates

Globals:

  • matcher_good_threshold default good match rating
Note:
Exceptions: none
History: Tue Apr 9 08:33:13 1991, DSJ, Created.
void tesseract::Classify::SettupPass1 (  ) 

This routine prepares the adaptive matcher for the start of the first pass. Learning is enabled (unless it is disabled for the whole program).

Note:
This is somewhat redundant: it simply says that if learning is enabled then it will remain enabled on the first pass, and if it is disabled then it will remain disabled. This is only put here to make it very clear that learning is controlled directly by the global setting of EnableLearning.

Globals:

Note:
Exceptions: none
History: Mon Apr 15 16:39:29 1991, DSJ, Created.
void tesseract::Classify::SettupPass2 (  ) 

This routine prepares the adaptive matcher for the start of the second pass. Further learning is disabled.

Globals:

Note:
Exceptions: none
History: Mon Apr 15 16:39:29 1991, DSJ, Created.
const ShapeTable* tesseract::Classify::shape_table (  )  const [inline]
int tesseract::Classify::ShapeIDToClassID ( int  shape_id  )  const
void tesseract::Classify::ShowBestMatchFor ( TBLOB *  Blob,
const DENORM &  denorm,
CLASS_ID  ClassId,
int  shape_id,
BOOL8  AdaptiveOn,
BOOL8  PreTrainedOn,
ADAPT_RESULTS *  Results 
)

This routine compares Blob to both sets of templates (adaptive and pre-trained) and then displays debug information for the config which matched best.

Parameters:
Blob blob to show best matching config for
ClassId class whose configs are to be searched
AdaptiveOn TRUE if adaptive configs are enabled
PreTrainedOn TRUE if pretrained configs are enabled

Globals:

  • PreTrainedTemplates built-in training
  • AdaptedTemplates adaptive templates
  • AllProtosOn dummy proto mask
  • AllConfigsOn dummy config mask
Note:
Exceptions: none
History: Fri Mar 22 08:43:52 1991, DSJ, Created.
void tesseract::Classify::ShowMatchDisplay (  ) 
bool tesseract::Classify::TempConfigReliable ( CLASS_ID  class_id,
const TEMP_CONFIG config 
)
void tesseract::Classify::UpdateAmbigsGroup ( CLASS_ID  class_id,
const DENORM denorm,
TBLOB Blob 
)
void tesseract::Classify::WriteAdaptedTemplates ( FILE *  File,
ADAPT_TEMPLATES  Templates 
)

This routine saves Templates to File in a binary format.

Parameters:
File open file to write Templates to (output is written in binary format)
Templates set of adapted templates to write to File
Note:
Globals: none
Exceptions: none
History: Mon Mar 18 15:07:32 1991, DSJ, Created.
void tesseract::Classify::WriteIntTemplates ( FILE *  File,
INT_TEMPLATES  Templates,
const UNICHARSET target_unicharset 
)

Member Data Documentation

"Certainty scaling factor"

"Threshold for good features during adaptive 0-255"

"Threshold for good protos during adaptive 0-255"

"Assume the input is numbers [0-9]."

"Character Normalization Range ..."

"Exclude fragments that do not match any whole character" " with at least this certainty"

"Class Pruner Multiplier 0-255: "

"Class Pruner Threshold 0-255"

"Class Pruner CutoffStrength: "

"Bring up graphical debugging windows for fragments training"

"Classify debug level"

"Enable match debugger"

"Enable adaptive classifier"

"Enable adaptive classifier"

"Integer Matcher Multiplier 0-255: "

"Class str to debug learning"

"Learning Debug Level: "

"Max char x-norm scale ..."

"Max char y-norm scale ..."

"Min char x-norm scale ..."

"Min char y-norm scale ..."

"Penalty to apply when a non-alnum is vertically out of " "its expected textline position"

"Normalization Method ..."

"Save adapted templates to a file"

"Use pre-adapted classifier templates"

"Do not include character fragments in the" " results of the classifier"

"Dont adapt to i/I at beginning of word"

"Avg. noise blob length: "

"Bad Match Pad (0-1)"

"Maximum angle delta for prototype clustering"

"Matcher Debug Flags"

"Matcher Debug Level"

"Use two different windows for debugging the matching: " "One for the protos and one for the features."

"Good Match (0-1)"

"Great Match (0-1)"

"Reliable Config Threshold"

"Perfect Match (0-1)"

"Min # of permanent classes"

"New template margin (0-1)"

"Enable adaption even if the ambiguities have not been seen"

"Prioritize blob division over chopping"

"Rating scaling factor"

"Baseline Normalized Matching"

"Character Normalized Matching"

"Scale factor for features not used"

"Top choice only from CP"


The documentation for this class was generated from the following files:
Generated on Thu Feb 2 08:19:26 2012 for Tesseract by  doxygen 1.6.3