00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #ifndef TRIE_H
00026 #define TRIE_H
00027
00028 #include "dawg.h"
00029 #include "cutil.h"
00030 #include "genericvector.h"
00031
00032 class UNICHARSET;
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
// Index of an edge within one node's edge vector (Trie::make_edge_ref()
// packs a value of this type together with a node index).
00044 typedef inT64 EDGE_INDEX;
// Pointer to bool flags; passed as `reduced_nodes` to
// Trie::reduce_lettered_edges() and Trie::reduce_node_input().
// NOTE(review): presumably one flag per trie node - confirm in trie.cpp.
00045 typedef bool *NODE_MARKER;
// Growable array of packed edge records.
00046 typedef GenericVector<EDGE_RECORD> EDGE_VECTOR;
00047
// Per-node storage of a Trie: the edges stored for the node in each
// direction, kept in two separate vectors.
00048 struct TRIE_NODE_RECORD {
// Edges looked up through EDGE_REFs (see Trie::deref_edge_ref()).
00049 EDGE_VECTOR forward_edges;
// NOTE(review): presumably the reverse links created with BACKWARD_EDGE by
// Trie::add_new_edge() - confirm in trie.cpp.
00050 EDGE_VECTOR backward_edges;
00051 };
// Array of heap-allocated node records; Trie's destructor releases them via
// delete_data_pointers().
00052 typedef GenericVector<TRIE_NODE_RECORD *> TRIE_NODES;
00053
00054 namespace tesseract {
00055
00062 class Trie : public Dawg {
00063 public:
00064 enum RTLReversePolicy {
00065 RRP_DO_NO_REVERSE,
00066 RRP_REVERSE_IF_HAS_RTL,
00067 RRP_FORCE_REVERSE,
00068 };
00069
00070
00071 static const int kSaneNumConcreteChars = 4;
00072
00073
00074
00075 static const char kAlphaPatternUnicode[];
00076 static const char kDigitPatternUnicode[];
00077 static const char kAlphanumPatternUnicode[];
00078 static const char kPuncPatternUnicode[];
00079 static const char kLowerPatternUnicode[];
00080 static const char kUpperPatternUnicode[];
00081
00082 static const char *get_reverse_policy_name(
00083 RTLReversePolicy reverse_policy);
00084
00085
00086
00087
00088
00089 Trie(DawgType type, const STRING &lang, PermuterType perm,
00090 uinT64 max_num_edges, int unicharset_size, int debug_level) {
00091 init(type, lang, perm, unicharset_size, debug_level);
00092 num_edges_ = 0;
00093 max_num_edges_ = max_num_edges;
00094 deref_node_index_mask_ = ~letter_mask_;
00095 new_dawg_node();
00096 initialized_patterns_ = false;
00097 }
00098 virtual ~Trie() { nodes_.delete_data_pointers(); }
00099
00100
00101 void clear();
00102
00104 EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id,
00105 bool word_end) const {
00106 EDGE_RECORD *edge_ptr;
00107 EDGE_INDEX edge_index;
00108 if (!edge_char_of(node_ref, NO_EDGE, FORWARD_EDGE, word_end, unichar_id,
00109 &edge_ptr, &edge_index)) return NO_EDGE;
00110 return make_edge_ref(node_ref, edge_index);
00111 }
00112
00117 void unichar_ids_of(NODE_REF node, NodeChildVector *vec) const {
00118 const EDGE_VECTOR &forward_edges =
00119 nodes_[static_cast<int>(node)]->forward_edges;
00120 for (int i = 0; i < forward_edges.size(); ++i) {
00121 vec->push_back(NodeChild(unichar_id_from_edge_rec(forward_edges[i]),
00122 make_edge_ref(node, i)));
00123 }
00124 }
00125
00130 NODE_REF next_node(EDGE_REF edge_ref) const {
00131 if (edge_ref == NO_EDGE || num_edges_ == 0) return NO_EDGE;
00132 return next_node_from_edge_rec(*deref_edge_ref(edge_ref));
00133 }
00134
00139 bool end_of_word(EDGE_REF edge_ref) const {
00140 if (edge_ref == NO_EDGE || num_edges_ == 0) return false;
00141 return end_of_word_from_edge_rec(*deref_edge_ref(edge_ref));
00142 }
00143
00145 UNICHAR_ID edge_letter(EDGE_REF edge_ref) const {
00146 if (edge_ref == NO_EDGE || num_edges_ == 0) return INVALID_UNICHAR_ID;
00147 return unichar_id_from_edge_rec(*deref_edge_ref(edge_ref));
00148 }
00149
00150
00151
00152 void print_node(NODE_REF node, int max_num_edges) const;
00153
00154
00155
00156
00157
00158 SquishedDawg *trie_to_dawg();
00159
00160
00161
00162
00163 bool read_word_list(const char *filename,
00164 const UNICHARSET &unicharset,
00165 Trie::RTLReversePolicy reverse);
00166
00167
00168
00169
00170
00171
00172
00173
00174
00175
00176
00177
00178
00179
00180
00181
00182
00183
00184
00185
00186
00187
00188
00189
00190
00191
00192
00193
00194
00195
00196
00197
00198
00199
00200
00201
00202
00203
00204
00205
00206
00207
00208 bool read_pattern_list(const char *filename, const UNICHARSET &unicharset);
00209
00210
00211
00212 void initialize_patterns(UNICHARSET *unicharset);
00213
00214
00215
00216 void unichar_id_to_patterns(UNICHAR_ID unichar_id,
00217 const UNICHARSET &unicharset,
00218 GenericVector<UNICHAR_ID> *vec) const;
00219
00220
00221
00222
00223 virtual EDGE_REF pattern_loop_edge(EDGE_REF edge_ref,
00224 UNICHAR_ID unichar_id,
00225 bool word_end) const {
00226 if (edge_ref == NO_EDGE) return NO_EDGE;
00227 EDGE_RECORD *edge_rec = deref_edge_ref(edge_ref);
00228 return (marker_flag_from_edge_rec(*edge_rec) &&
00229 unichar_id == unichar_id_from_edge_rec(*edge_rec) &&
00230 word_end == end_of_word_from_edge_rec(*edge_rec)) ?
00231 edge_ref : NO_EDGE;
00232 }
00233
00234
00235
00236
00237
00238
00239
00240
00241
00242
00243 bool add_word_to_dawg(const WERD_CHOICE &word,
00244 const GenericVector<bool> *repetitions);
00245 bool add_word_to_dawg(const WERD_CHOICE &word) {
00246 return add_word_to_dawg(word, NULL);
00247 }
00248
00249 protected:
00250
00251
00252
00253
00254
00255
00256
00257
00258
00259
00260
00261
00262
00263
00264
00265
00266
00267 inline EDGE_RECORD *deref_edge_ref(EDGE_REF edge_ref) const {
00268 int edge_index = static_cast<int>(
00269 (edge_ref & letter_mask_) >> LETTER_START_BIT);
00270 int node_index = static_cast<int>(
00271 (edge_ref & deref_node_index_mask_) >> flag_start_bit_);
00272 TRIE_NODE_RECORD *node_rec = nodes_[node_index];
00273 return &(node_rec->forward_edges[edge_index]);
00274 }
00276 inline EDGE_REF make_edge_ref(NODE_REF node_index,
00277 EDGE_INDEX edge_index) const {
00278 return ((node_index << flag_start_bit_) |
00279 (edge_index << LETTER_START_BIT));
00280 }
00282 inline void link_edge(EDGE_RECORD *edge, NODE_REF nxt, bool repeats,
00283 int direction, bool word_end, UNICHAR_ID unichar_id) {
00284 EDGE_RECORD flags = 0;
00285 if (repeats) flags |= MARKER_FLAG;
00286 if (word_end) flags |= WERD_END_FLAG;
00287 if (direction == BACKWARD_EDGE) flags |= DIRECTION_FLAG;
00288 *edge = ((nxt << next_node_start_bit_) |
00289 (static_cast<EDGE_RECORD>(flags) << flag_start_bit_) |
00290 (static_cast<EDGE_RECORD>(unichar_id) << LETTER_START_BIT));
00291 }
00293 inline void print_edge_rec(const EDGE_RECORD &edge_rec) const {
00294 tprintf("|" REFFORMAT "|%s%s%s|%d|", next_node_from_edge_rec(edge_rec),
00295 marker_flag_from_edge_rec(edge_rec) ? "R," : "",
00296 (direction_from_edge_rec(edge_rec) == FORWARD_EDGE) ? "F" : "B",
00297 end_of_word_from_edge_rec(edge_rec) ? ",E" : "",
00298 unichar_id_from_edge_rec(edge_rec));
00299 }
00300
00301
00302 inline bool can_be_eliminated(const EDGE_RECORD &edge_rec) {
00303 NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
00304 return (node_ref != NO_EDGE &&
00305 nodes_[static_cast<int>(node_ref)]->forward_edges.size() == 1);
00306 }
00307
00308
00309
00310 void print_all(const char* msg, int max_num_edges) {
00311 tprintf("\n__________________________\n%s\n", msg);
00312 for (int i = 0; i < nodes_.size(); ++i) print_node(i, max_num_edges);
00313 tprintf("__________________________\n");
00314 }
00315
00316
00317
00318
00319
00320 bool edge_char_of(NODE_REF node_ref, NODE_REF next_node,
00321 int direction, bool word_end, UNICHAR_ID unichar_id,
00322 EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const;
00323
00324
00325
00326 bool add_edge_linkage(NODE_REF node1, NODE_REF node2, bool repeats,
00327 int direction, bool word_end,
00328 UNICHAR_ID unichar_id);
00329
00330
00331
00332 bool add_new_edge(NODE_REF node1, NODE_REF node2,
00333 bool repeats, bool word_end, UNICHAR_ID unichar_id) {
00334 return (add_edge_linkage(node1, node2, repeats, FORWARD_EDGE,
00335 word_end, unichar_id) &&
00336 add_edge_linkage(node2, node1, repeats, BACKWARD_EDGE,
00337 word_end, unichar_id));
00338 }
00339
00340
00341
00342 void add_word_ending(EDGE_RECORD *edge,
00343 NODE_REF the_next_node,
00344 bool repeats,
00345 UNICHAR_ID unichar_id);
00346
00347
00348 NODE_REF new_dawg_node();
00349
00350
00351
00352 void remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction,
00353 bool word_end, UNICHAR_ID unichar_id);
00354
00355
00356
00357 void remove_edge(NODE_REF node1, NODE_REF node2,
00358 bool word_end, UNICHAR_ID unichar_id) {
00359 remove_edge_linkage(node1, node2, FORWARD_EDGE, word_end, unichar_id);
00360 remove_edge_linkage(node2, node1, BACKWARD_EDGE, word_end, unichar_id);
00361 }
00362
00363
00364
00365
00366 bool eliminate_redundant_edges(NODE_REF node, const EDGE_RECORD &edge1,
00367 const EDGE_RECORD &edge2);
00368
00369
00370
00371
00372
00373
00374 bool reduce_lettered_edges(EDGE_INDEX edge_index,
00375 UNICHAR_ID unichar_id,
00376 NODE_REF node,
00377 const EDGE_VECTOR &backward_edges,
00378 NODE_MARKER reduced_nodes);
00379
00386 void sort_edges(EDGE_VECTOR *edges);
00387
00389 void reduce_node_input(NODE_REF node, NODE_MARKER reduced_nodes);
00390
00391
00392 UNICHAR_ID character_class_to_pattern(char ch);
00393
00394
00395 TRIE_NODES nodes_;
00396 uinT64 num_edges_;
00397 uinT64 max_num_edges_;
00398 uinT64 deref_direction_mask_;
00399 uinT64 deref_node_index_mask_;
00400
00401
00402 bool initialized_patterns_;
00403 UNICHAR_ID alpha_pattern_;
00404 UNICHAR_ID digit_pattern_;
00405 UNICHAR_ID alphanum_pattern_;
00406 UNICHAR_ID punc_pattern_;
00407 UNICHAR_ID lower_pattern_;
00408 UNICHAR_ID upper_pattern_;
00409 };
00410 }
00411
00412 #endif