00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00020
00021 #ifndef TESSERACT_CCUTIL_AMBIGS_H_
00022 #define TESSERACT_CCUTIL_AMBIGS_H_
00023
00024 #include "elst.h"
00025 #include "tprintf.h"
00026 #include "unichar.h"
00027 #include "unicharset.h"
00028 #include "genericvector.h"
00029
// Sizes the fixed UNICHAR_ID arrays in AmbigSpec: wrong_ngram and
// correct_fragments each hold MAX_AMBIG_SIZE + 1 entries.
// NOTE(review): the extra slot presumably holds the INVALID_UNICHAR_ID
// terminator that UnicharIdArrayUtils relies on - confirm.
00030 #define MAX_AMBIG_SIZE 10
00031
00032 namespace tesseract {
00033
// Vector of unichar ids (GenericVector instantiated on UNICHAR_ID).
00034 typedef GenericVector<UNICHAR_ID> UnicharIdVector;
00035
// Buffer size constant for unigram ambiguities.
// NOTE(review): its consumers are not visible in this header - confirm usage.
static const int kUnigramAmbigsBufferSize = 1000;

// A single space followed by the terminating NUL (two chars in total).
static const char kAmbigNgramSeparator[] = " ";

// Tab and space followed by the terminating NUL (three chars in total).
static const char kAmbigDelimiters[] = { '\t', ' ', '\0' };

// printf-style message; takes one int argument (the line number).
static const char kIllegalMsg[] = "Illegal ambiguity specification on line %d\n";

// printf-style message; takes one string argument (the offending unichar).
static const char kIllegalUnicharMsg[] = "Illegal unichar %s in ambiguity specification\n";
00043
// Classification of an ambiguity entry (stored in AmbigSpec::type).
// The numeric values are spelled out; they equal the implicit ones.
// NOTE(review): the meaning of each kind is defined by the code that
// consumes it, which is not visible here - confirm before relying on names.
enum AmbigType {
  NOT_AMBIG = 0,
  REPLACE_AMBIG = 1,
  DEFINITE_AMBIG = 2,
  SIMILAR_AMBIG = 3,
  CASE_AMBIG = 4,

  // Equals the number of kinds listed above; must stay last.
  AMBIG_TYPE_COUNT = 5
};
00053
00054
00055
00056 class UnicharIdArrayUtils {
00057 public:
00058
00059
00060
00061
00062 static inline int compare(const UNICHAR_ID array1[],
00063 const UNICHAR_ID array2[]) {
00064 const UNICHAR_ID *ptr1 = array1;
00065 const UNICHAR_ID *ptr2 = array2;
00066 while (*ptr1 != INVALID_UNICHAR_ID && *ptr2 != INVALID_UNICHAR_ID) {
00067 if (*ptr1 != *ptr2) return *ptr1 < *ptr2 ? -1 : 1;
00068 ++ptr1;
00069 ++ptr2;
00070 }
00071 if (*ptr1 == INVALID_UNICHAR_ID && *ptr2 == INVALID_UNICHAR_ID) return 0;
00072 return *ptr1 == INVALID_UNICHAR_ID ? -1 : 1;
00073 }
00074
00075
00076
00077 static inline int find_in(const UnicharIdVector& uid_vec,
00078 const UNICHAR_ID uid) {
00079 for (int i = 0; i < uid_vec.size(); ++i)
00080 if (uid_vec[i] == uid) return i;
00081 return -1;
00082 }
00083
00084
00085
00086
00087 static inline int copy(const UNICHAR_ID src[], UNICHAR_ID dst[]) {
00088 int i = 0;
00089 do {
00090 dst[i] = src[i];
00091 } while (dst[i++] != INVALID_UNICHAR_ID);
00092 return i - 1;
00093 }
00094
00095
00096
00097 static inline void print(const UNICHAR_ID array[],
00098 const UNICHARSET &unicharset) {
00099 const UNICHAR_ID *ptr = array;
00100 if (*ptr == INVALID_UNICHAR_ID) tprintf("[Empty]");
00101 while (*ptr != INVALID_UNICHAR_ID) {
00102 tprintf("%s ", unicharset.id_to_unichar(*ptr++));
00103 }
00104 tprintf("( ");
00105 ptr = array;
00106 while (*ptr != INVALID_UNICHAR_ID) tprintf("%d ", *ptr++);
00107 tprintf(")\n");
00108 }
00109 };
00110
00111
00112
00113 class AmbigSpec : public ELIST_LINK {
00114 public:
00115 AmbigSpec();
00116 ~AmbigSpec() {}
00117
00118
00119
00120
00121 static int compare_ambig_specs(const void *spec1, const void *spec2) {
00122 const AmbigSpec *s1 =
00123 *reinterpret_cast<const AmbigSpec * const *>(spec1);
00124 const AmbigSpec *s2 =
00125 *reinterpret_cast<const AmbigSpec * const *>(spec2);
00126 return UnicharIdArrayUtils::compare(s1->wrong_ngram, s2->wrong_ngram);
00127 }
00128
00129 UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
00130 UNICHAR_ID correct_fragments[MAX_AMBIG_SIZE + 1];
00131 UNICHAR_ID correct_ngram_id;
00132 AmbigType type;
00133 int wrong_ngram_size;
00134 };
00135 ELISTIZEH(AmbigSpec);
00136
00137
00138
00139 typedef GenericVector<AmbigSpec_LIST *> UnicharAmbigsVector;
00140
00141 class UnicharAmbigs {
00142 public:
00143 UnicharAmbigs() {}
00144 ~UnicharAmbigs() {
00145 replace_ambigs_.delete_data_pointers();
00146 dang_ambigs_.delete_data_pointers();
00147 one_to_one_definite_ambigs_.delete_data_pointers();
00148 }
00149
00150 const UnicharAmbigsVector &dang_ambigs() const { return dang_ambigs_; }
00151 const UnicharAmbigsVector &replace_ambigs() const { return replace_ambigs_; }
00152
00153
00154
00155
00156
00157
00158
00159
00160
00161
00162
00163 void LoadUnicharAmbigs(FILE *ambigs_file, inT64 end_offset, int debug_level,
00164 bool use_ambigs_for_adaption, UNICHARSET *unicharset);
00165
00166
00167 inline const UnicharIdVector *OneToOneDefiniteAmbigs(
00168 UNICHAR_ID unichar_id) const {
00169 if (one_to_one_definite_ambigs_.empty()) return NULL;
00170 return one_to_one_definite_ambigs_[unichar_id];
00171 }
00172
00173
00174
00175
00176
00177
00178 inline const UnicharIdVector *AmbigsForAdaption(
00179 UNICHAR_ID unichar_id) const {
00180 if (ambigs_for_adaption_.empty()) return NULL;
00181 return ambigs_for_adaption_[unichar_id];
00182 }
00183
00184
00185
00186
00187 inline const UnicharIdVector *ReverseAmbigsForAdaption(
00188 UNICHAR_ID unichar_id) const {
00189 if (reverse_ambigs_for_adaption_.empty()) return NULL;
00190 return reverse_ambigs_for_adaption_[unichar_id];
00191 }
00192
00193 private:
00194
00195 bool ParseAmbiguityLine(int line_num, int version, int debug_level,
00196 const UNICHARSET &unicharset, char *buffer,
00197 int *TestAmbigPartSize, UNICHAR_ID *TestUnicharIds,
00198 int *ReplacementAmbigPartSize,
00199 char *ReplacementString, int *type);
00200 void InsertIntoTable(UnicharAmbigsVector &table,
00201 int TestAmbigPartSize, UNICHAR_ID *TestUnicharIds,
00202 int ReplacementAmbigPartSize,
00203 const char *ReplacementString, int type,
00204 AmbigSpec *ambig_spec, UNICHARSET *unicharset);
00205 UnicharAmbigsVector dang_ambigs_;
00206 UnicharAmbigsVector replace_ambigs_;
00207 GenericVector<UnicharIdVector *> one_to_one_definite_ambigs_;
00208 GenericVector<UnicharIdVector *> ambigs_for_adaption_;
00209 GenericVector<UnicharIdVector *> reverse_ambigs_for_adaption_;
00210 };
00211
00212 }
00213
00214 #endif // TESSERACT_CCUTIL_AMBIGS_H_