parseutils.cpp
字號:
log_error(ERROR_INVALID_INTERVAL, " lower > upper", NULL); return false; } else if(line_int.upper > pfile->height) { log_error(ERROR_INVALID_INTERVAL, "upper > scan height", NULL); return false; } int cur_entry_y = scan_ignore_edges_top - 1, cur_line_num = 1; box_t line_box; vector<box_t> chars; line_node lnode; char_node cnode; while(true) { line_box = find_line_box(cur_entry_y, pfile); if(equals_NULL_BOX(line_box)) return true; else if((line_box.vert_len < line_int.lower) || (line_box.vert_len > line_int.upper)) { //printf("***********************************************\n"); log_error(ERROR_INVALID_INTERVAL, "line_box out of int", NULL); //printf("line_box top corner: (x, y) = (%d, %d)\n", // line_box.xy.x, line_box.xy.y); //printf("line_box vert_len = %d\n", line_box.vert_len); //printf("***********************************************\n"); cur_entry_y = line_box.xy.y + line_box.vert_len; continue; } // add line to list: lnode.self_box = line_box; lnode.line_num = cur_line_num; if(plist->pl_begin == NULL) { plist->pl_begin = new parsed_line(lnode); plist->pl_end = plist->pl_begin; } else { plist->pl_end->next = new parsed_line(lnode); plist->pl_end->next->prev = plist->pl_end; plist->pl_end = plist->pl_end->next; } // add chars to line: chars = find_chars(line_box, pfile); cnode.is_space = false; cnode.character = ""; // have yet to interpret for(int i = 0; i < chars.size(); i++) { cnode.self_box = chars[i]; // plist->pl_end is the current parsed_line *: if(plist->pl_end->pc_begin == NULL) { plist->pl_end->pc_begin = new parsed_char(cnode); plist->pl_end->pc_end = plist->pl_end->pc_begin; } else { plist->pl_end->pc_end->next = new parsed_char(cnode); plist->pl_end->pc_end->next->prev = plist->pl_end->pc_end; plist->pl_end->pc_end = plist->pl_end->pc_end->next; } } cur_entry_y = line_box.xy.y + line_box.vert_len; cur_line_num++; } // while}// gridding general utility:box_t find_biggest_char_box_hlen(parse_list *plist) { box_t r_val = NULL_BOX(); int hlen = 0; 
if(plist->pl_begin == NULL) return r_val; parsed_line *cur_pl = plist->pl_begin; while(cur_pl != NULL) { parsed_char *cur_pc = cur_pl->pc_begin; while(cur_pc != NULL) { if(cur_pc->self_node.is_space) { cur_pc = cur_pc->next; continue; } if(cur_pc->self_node.self_box.horiz_len > hlen) { r_val = cur_pc->self_node.self_box; hlen = cur_pc->self_node.self_box.horiz_len; } cur_pc = cur_pc->next; } cur_pl = cur_pl->next; } return r_val;}box_t find_biggest_char_box_vlen(parse_list *plist) { box_t r_val = NULL_BOX(); int vlen = 0; if(plist->pl_begin == NULL) return r_val; parsed_line *cur_pl = plist->pl_begin; while(cur_pl != NULL) { parsed_char *cur_pc = cur_pl->pc_begin; while(cur_pc != NULL) { if(cur_pc->self_node.is_space) { cur_pc = cur_pc->next; continue; } if(cur_pc->self_node.self_box.vert_len > vlen) { r_val = cur_pc->self_node.self_box; vlen = cur_pc->self_node.self_box.vert_len; } cur_pc = cur_pc->next; } cur_pl = cur_pl->next; } return r_val;}grid_t grid_char(unsigned char *buf, int buf_rows, int buf_cols, int pos_offset_x, int pos_offset_y, int gsx, int gsy, int gcx, int gcy) { grid_t r_grid = zero_matrix(gsy / gcy, gsx / gcx); int grid_y_pos = -1, grid_x_pos = -1; for(int i = 0; i < buf_rows; i++) { for(int j = 0; j < buf_cols; j++) { if(grid(buf, buf_rows, buf_cols, i, j)) { // find grid cell the pixel belongs to: grid_y_pos = (pos_offset_y + i) / gcy; grid_x_pos = (pos_offset_x + j) / gcx; if((grid_y_pos >= (gsy / gcy)) || (grid_x_pos >= (gsx / gcx))) { log_error(ERROR_GRID_ERROR, "in grid char: the grid size is too small", NULL); return r_grid; } r_grid[grid_y_pos][grid_x_pos] = true; } } } return r_grid; }grid_t grid_char(box_t char_box) { int gsx = trcfg_grid_size_x, gsy = trcfg_grid_size_y, gcx = trcfg_gridcell_size_x, gcy = trcfg_gridcell_size_y; grid_t r_grid; // compute offsets: if((gsx < char_box.horiz_len) || (gsy < char_box.vert_len)) { log_error(ERROR_GRID_ERROR, "in grid_char: grid is smaller than char", NULL); return r_grid; } r_grid = 
zero_matrix(gsy / gcy, gsx / gcx); int grid_offset_x = (gsx - char_box.horiz_len) / 2; int grid_offset_y = (gsy - char_box.vert_len) / 2; int grid_y_pos, grid_x_pos; for(int i = 0; i < char_box.vert_len; i++) { for(int j = 0; j < char_box.horiz_len; j++) { if(pf->grid(char_box.xy.y + i, char_box.xy.x + j)) { grid_y_pos = (grid_offset_y + i) / gcy; grid_x_pos = (grid_offset_x + j) / gcx; if((grid_y_pos >= (gsy / gcy)) || (grid_x_pos >= (gsx / gcx))) { log_error(ERROR_GRID_ERROR, "in grid_char: grid size too small", NULL); return r_grid; } r_grid[grid_y_pos][grid_x_pos] = true; } } // for j } // for i return r_grid;}vector<char_probability> identify_character(grid_t char_grid) { vector<char_probability> r_val; if(prof == NULL) return r_val; vector<char_probability>::iterator iter; char_probability chpr; double total_pr, num, denom; double gc_l_hits, notgc_l_hits, l_insts, total_gc_hits, total_insts, total_notgc_hits; double cur_pr, p_gc_given_l, p_notgc_given_l, p_letter, p_gc, p_notgc; bool pr_done; for(int i = 0; i < (prof->character_nodes.size()); i++) { num = 1.0; denom = 1.0; pr_done = false; for(int j = 0; j < char_grid.size(); j++) { for(int k = 0; k < char_grid[j].size(); k++) { if((prof->character_nodes[i].num_instances) == 0) { num = 0.0; pr_done = true; break; } gc_l_hits = (double) prof->character_nodes[i].gc_count[j][k]; l_insts = (double) prof->character_nodes[i].num_instances; notgc_l_hits = l_insts - gc_l_hits; total_insts = (double) prof->total_instances; total_gc_hits = (double) prof->gridcell_totals[j][k]; total_notgc_hits = total_insts - total_gc_hits; p_letter = 1.0 / NUM_TRAINED_CHARS; if(char_grid[j][k]) { if((prof->character_nodes[i].gc_count[j][k]) > 0) p_gc_given_l = gc_l_hits / l_insts; else p_gc_given_l = ZERO_INST_PR1_PGIVEN; if((prof->gridcell_totals[j][k]) > 0) p_gc = total_gc_hits / total_insts; else p_gc = ZERO_INST_PR1_PGC; cur_pr = p_gc_given_l * p_letter / p_gc; } else { // grid cells NOT hit 
if((prof->character_nodes[i].num_instances - prof->character_nodes[i].gc_count[j][k]) > 0) p_notgc_given_l = notgc_l_hits / l_insts; else p_notgc_given_l = ZERO_INST_PR0_PGIVEN; if((prof->total_instances - prof->gridcell_totals[j][k]) > 0) p_notgc = total_notgc_hits / total_insts; else p_notgc = ZERO_INST_PR0_PGC; cur_pr = p_notgc_given_l * p_letter / p_notgc; } num *= cur_pr * PR_SCALING_FACTOR; denom *= (1.0 - cur_pr) * PR_SCALING_FACTOR; } // for k if(pr_done) break; } // for j // p0 = p(letter | gridcell 0) [or NOT gridcell 0]; // p(letter | gridcell 0) = p(gridcell 0 | letter) * p(letter) // __________________________________ // p(gridcell 0) // // naive bayes (see paulgraham.com/naivebayes.html): // p0 * ... * pn // total_pr = _________________________________________ // p0 * ... * pn + (1 - p0) * ... * (1 - pn) total_pr = num / (num + denom); iter = r_val.begin(); chpr.character = prof->character_nodes[i].character; chpr.probability = total_pr; // add to r_val (keep sorted): if(r_val.size() == 0) r_val.push_back(chpr); else { for(int l = 0; l < r_val.size(); l++) { if((l == 0) && (total_pr >= r_val[0].probability)) { r_val.insert(iter, chpr); break; } else if((l != 0) && (total_pr <= r_val[l - 1].probability) && (total_pr >= r_val[l].probability)) { r_val.insert(iter, chpr); break; } else if(l == (r_val.size() - 1)) { r_val.push_back(chpr); break; } iter++; } // for l } // else } // for i}// other utility:plist_parsed_char point_to_char(parse_list *plist, int x, int y, bool search_alternate) { plist_parsed_char r_val; if((plist == NULL) || (plist->pl_begin == NULL) || (plist->pl_begin->pc_begin == NULL)) { r_val.pc = NULL; r_val.pl = NULL; return r_val; } // find line with y point (if any): parsed_line *pl = plist->pl_begin; bool line_found = false; while(pl != NULL) { if((pl->self_node.self_box.xy.y <= y) && ((pl->self_node.self_box.xy.y + pl->self_node.self_box.vert_len) >= y) && (pl->self_node.self_box.xy.x <= x) && ((pl->self_node.self_box.xy.x + 
pl->self_node.self_box.horiz_len) >= x)) { line_found = true; break; } pl = pl->next; } if(line_found) r_val.pl = pl; else { r_val.pl = NULL; r_val.pc = NULL; return r_val; } // find the char box (if any): parsed_char *pc = pl->pc_begin; bool char_found = false, checking_next = false; while(pc != NULL) { if((pc->self_node.self_box.xy.x <= x) && ((pc->self_node.self_box.xy.x + pc->self_node.self_box.horiz_len) >= x) && (pc->self_node.self_box.xy.y <= y) && ((pc->self_node.self_box.xy.y + pc->self_node.self_box.vert_len) >= y)) { if(checking_next) { char_found = true; break; } if(search_alternate && (pc->next != NULL)) checking_next = true; else { char_found = true; break; } } else if(checking_next) { char_found = true; pc = pc->prev; break; } pc = pc->next; } if(char_found) r_val.pc = pc; else { r_val.pl = NULL; r_val.pc = NULL; return r_val; } return r_val;}
快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -