Figure 1: Document Information Extraction generally first looks for barcodes or QR codes in the document (Step 1), then applies OCR (Step 2), looks for keywords (Step 3), applies DocReader (Step 4) and Chargrid (Step 5), and finally enriches the data with the given metadata (Step 6).
Figure 2: OCR Endpoint of the Swagger UI of the Document Information Extraction Service.
First, the lines of the document are detected.
{
"results": {
"1": [
{
"word_boxes": [
{
"bbox": [
[
141,
103
],
[
263,
262
]
],
"content": "�"
},
{
"bbox": [
[
398,
99
],
[
894,
260
]
],
"content": "Sights"
}
],
"bbox": [
[
141,
99
],
[
894,
262
]
]
},
{
"word_boxes": [
{
"bbox": [
[
62,
322
],
[
699,
473
]
],
"content": "Downtown"
},
{
"bbox": [
[
777,
303
],
[
1219,
447
]
],
"content": "Toronto"
},
{
"bbox": [
[
1290,
297
],
[
1386,
429
]
],
"content": "is"
},
{
"bbox": [
[
1458,
265
],
[
1606,
416
]
],
"content": "an"
},
{
"bbox": [
[
1687,
280
],
[
2494,
476
]
],
"content": "easy-to-navigate"
}
],
"bbox": [
[
65,
306
],
[
2490,
484
]
]
},
{
"word_boxes": [
{
"bbox": [
[
58,
502
],
[
333,
634
]
],
"content": "grid,"
},
{
"bbox": [
[
415,
481
],
[
903,
621
]
],
"content": "bounded"
},
{
"bbox": [
[
989,
473
],
[
1115,
600
]
],
"content": "by"
},
{
"bbox": [
[
1185,
468
],
[
1251,
593
]
],
"content": "a"
},
{
"bbox": [
[
1331,
441
],
[
2044,
610
]
],
"content": "hodgepodge"
},
{
"bbox": [
[
2116,
468
],
[
2219,
616
]
],
"content": "of"
},
{
"bbox": [
[
2281,
474
],
[
2494,
626
]
],
"content": "bohe"
}
],
"bbox": [
[
59,
449
],
[
2492,
648
]
]
},
{
"word_boxes": [
{
"bbox": [
[
66,
629
],
[
397,
812
]
],
"content": "mian,"
},
{
"bbox": [
[
486,
624
],
[
834,
806
]
],
"content": "ethnic"
},
{
"bbox": [
[
914,
620
],
[
1116,
801
]
],
"content": "and"
},
{
"bbox": [
[
1206,
613
],
[
1644,
797
]
],
"content": "historic"
},
{
"bbox": [
[
1722,
603
],
[
2484,
790
]
],
"content": "neighborhoods"
}
],
"bbox": [
[
66,
603
],
[
2484,
812
]
]
},
{
"word_boxes": [
{
"bbox": [
[
76,
797
],
[
420,
959
]
],
"content": "Yonge"
},
{
"bbox": [
[
492,
794
],
[
623,
953
]
],
"content": "St,"
},
{
"bbox": [
[
694,
791
],
[
872,
950
]
],
"content": "the"
},
{
"bbox": [
[
955,
784
],
[
1337,
947
]
],
"content": "world's"
},
{
"bbox": [
[
1404,
777
],
[
1863,
940
]
],
"content": "longest,"
},
{
"bbox": [
[
1936,
770
],
[
2337,
933
]
],
"content": "dissects"
},
{
"bbox": [
[
2384,
768
],
[
2502,
926
]
],
"content": "the"
}
],
"bbox": [
[
76,
768
],
[
2502,
959
]
]
},
{
"word_boxes": [
{
"bbox": [
[
76,
973
],
[
310,
1113
]
],
"content": "city:"
},
{
"bbox": [
[
384,
969
],
[
583,
1108
]
],
"content": "any"
},
{
"bbox": [
[
643,
958
],
[
1232,
1104
]
],
"content": "downtown"
},
{
"bbox": [
[
1294,
951
],
[
1626,
1093
]
],
"content": "street"
},
{
"bbox": [
[
1686,
946
],
[
1939,
1086
]
],
"content": "with"
},
{
"bbox": [
[
2006,
943
],
[
2136,
1081
]
],
"content": "an"
},
{
"bbox": [
[
2211,
938
],
[
2395,
1078
]
],
"content": "East"
},
{
"bbox": [
[
2437,
937
],
[
2503,
1074
]
],
"content": "or"
}
],
"bbox": [
[
76,
937
],
[
2503,
1113
]
]
},
{
"word_boxes": [
{
"bbox": [
[
86,
1133
],
[
366,
1269
]
],
"content": "West"
},
{
"bbox": [
[
418,
1123
],
[
1063,
1265
]
],
"content": "designation"
},
{
"bbox": [
[
1122,
1118
],
[
1442,
1255
]
],
"content": "refers"
},
{
"bbox": [
[
1493,
1116
],
[
1607,
1250
]
],
"content": "to"
},
{
"bbox": [
[
1661,
1113
],
[
1805,
1248
]
],
"content": "its"
},
{
"bbox": [
[
1862,
1106
],
[
2294,
1245
]
],
"content": "position"
},
{
"bbox": [
[
2339,
1103
],
[
2510,
1238
]
],
"content": "rela-"
}
],
"bbox": [
[
86,
1103
],
[
2510,
1269
]
]
},
{
"word_boxes": [
{
"bbox": [
[
85,
1294
],
[
294,
1429
]
],
"content": "tive"
},
{
"bbox": [
[
346,
1292
],
[
455,
1426
]
],
"content": "to"
},
{
"bbox": [
[
517,
1286
],
[
866,
1423
]
],
"content": "Yonge."
},
{
"bbox": [
[
928,
1280
],
[
1287,
1418
]
],
"content": "Unlike"
},
{
"bbox": [
[
1343,
1276
],
[
1606,
1412
]
],
"content": "New"
},
{
"bbox": [
[
1657,
1271
],
[
1954,
1407
]
],
"content": "York,"
},
{
"bbox": [
[
2004,
1266
],
[
2280,
1402
]
],
"content": "there"
},
{
"bbox": [
[
2321,
1264
],
[
2390,
1398
]
],
"content": "is"
},
{
"bbox": [
[
2429,
1263
],
[
2517,
1396
]
],
"content": "no"
}
],
"bbox": [
[
85,
1263
],
[
2517,
1429
]
]
},
{
"word_boxes": [
{
"bbox": [
[
92,
1447
],
[
699,
1578
]
],
"content": "distinction"
},
{
"bbox": [
[
793,
1441
],
[
1251,
1570
]
],
"content": "between"
},
{
"bbox": [
[
1338,
1437
],
[
1526,
1563
]
],
"content": "the"
},
{
"bbox": [
[
1614,
1430
],
[
2193,
1560
]
],
"content": "directions"
},
{
"bbox": [
[
2266,
1428
],
[
2353,
1552
]
],
"content": "of"
},
{
"bbox": [
[
2419,
1426
],
[
2522,
1551
]
],
"content": "av"
}
],
"bbox": [
[
92,
1426
],
[
2522,
1578
]
]
},
{
"word_boxes": [
{
"bbox": [
[
97,
1618
],
[
426,
1758
]
],
"content": "enues"
},
{
"bbox": [
[
474,
1615
],
[
675,
1753
]
],
"content": "and"
},
{
"bbox": [
[
721,
1609
],
[
1114,
1750
]
],
"content": "streets:"
},
{
"bbox": [
[
1162,
1602
],
[
1635,
1744
]
],
"content": "Spadina"
},
{
"bbox": [
[
1692,
1598
],
[
1902,
1737
]
],
"content": "Ave"
},
{
"bbox": [
[
1979,
1594
],
[
2201,
1733
]
],
"content": "runs"
},
{
"bbox": [
[
2242,
1590
],
[
2531,
1729
]
],
"content": "north-"
}
],
"bbox": [
[
97,
1590
],
[
2531,
1758
]
]
},
{
"word_boxes": [
{
"bbox": [
[
101,
1774
],
[
452,
1914
]
],
"content": "south,"
},
{
"bbox": [
[
541,
1771
],
[
719,
1909
]
],
"content": "but"
},
{
"bbox": [
[
810,
1764
],
[
1305,
1906
]
],
"content": "Danforth"
},
{
"bbox": [
[
1395,
1761
],
[
1612,
1900
]
],
"content": "Ave"
},
{
"bbox": [
[
1729,
1757
],
[
1976,
1896
]
],
"content": "runs"
},
{
"bbox": [
[
2057,
1750
],
[
2535,
1892
]
],
"content": "east-west"
}
],
"bbox": [
[
101,
1750
],
[
2535,
1914
]
]
},
{
"word_boxes": [
{
"bbox": [
[
114,
1927
],
[
507,
2071
]
],
"content": "There's"
},
{
"bbox": [
[
570,
1925
],
[
775,
2069
]
],
"content": "also"
},
{
"bbox": [
[
835,
1925
],
[
888,
2068
]
],
"content": "a"
},
{
"bbox": [
[
949,
1923
],
[
1263,
2067
]
],
"content": "street"
},
{
"bbox": [
[
1329,
1921
],
[
1662,
2066
]
],
"content": "called"
},
{
"bbox": [
[
1742,
1919
],
[
2171,
2064
]
],
"content": "Avenue"
},
{
"bbox": [
[
2235,
1918
],
[
2389,
2061
]
],
"content": "Rd."
},
{
"bbox": [
[
2435,
1917
],
[
2539,
2061
]
],
"content": "Go"
}
],
"bbox": [
[
114,
1917
],
[
2538,
2071
]
]
},
{
"word_boxes": [
{
"bbox": [
[
131,
2084
],
[
485,
2218
]
],
"content": "figure!"
}
],
"bbox": [
[
131,
2084
],
[
485,
2218
]
]
}
]
}
}
OCR Output of the image above using the DOX API
Figure 3: Network Structure of DocReader's Encoder and Spatial Aware Memory.
Figure 4: Network Architecture of DocReader's Decoder with Attention Layer.
Figure 5: Attention weights projected back onto input document.
Figure 6: Network Architecture of Chargrid.
You must be a registered user to add a comment. If you've already registered, sign in. Otherwise, register and sign in.
User | Count |
---|---|
20 | |
9 | |
8 | |
7 | |
7 | |
7 | |
7 | |
6 | |
5 | |
5 | |