From c8ac08c247d743d2aa1d41e0597894f1560afa65 Mon Sep 17 00:00:00 2001 From: Doug Qian Date: Thu, 16 Feb 2023 17:28:47 -0800 Subject: [PATCH 1/5] Fix child table header merge in 'merge_tables' pipeline --- src-python/trp/trp2.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src-python/trp/trp2.py b/src-python/trp/trp2.py index 9a7d4c0..fb52cb4 100644 --- a/src-python/trp/trp2.py +++ b/src-python/trp/trp2.py @@ -778,6 +778,12 @@ def merge_tables(self, table_array_ids: List[List[str]]): cell_block = self.get_block_by_id(cell_id) if cell_block and cell_block.row_index and parent_last_row: cell_block.row_index = parent_last_row + cell_block.row_index + # This is to make sure the child table's headers are merged + # as regular rows into the parent. + if cell_block.entity_types and len(cell_block.entity_types) > 0: + cell_block.entity_types = [ + entity_type for entity_type in cell_block.entity_types if entity_type != TextractEntityTypes.COLUMN_HEADER.name] + if parent_relationships.ids and cell_id not in parent_relationships.ids: parent_relationships.ids.append(cell_id) self.delete_blocks([table_id]) From ceced044983689ff3188f2a2c6eae7534162348d Mon Sep 17 00:00:00 2001 From: Doug Qian Date: Thu, 16 Feb 2023 17:31:50 -0800 Subject: [PATCH 2/5] Add new TextractEntityTypes --- src-python/trp/trp2.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src-python/trp/trp2.py b/src-python/trp/trp2.py index fb52cb4..1e59718 100644 --- a/src-python/trp/trp2.py +++ b/src-python/trp/trp2.py @@ -52,6 +52,7 @@ class TextractBlockTypes(Enum): class TextractEntityTypes(Enum): KEY = auto() VALUE = auto() + COLUMN_HEADER = auto() @dataclass(eq=True, repr=True) From f041d4ee77613d5d75fc7515c56863ec984ec997 Mon Sep 17 00:00:00 2001 From: Doug Qian Date: Fri, 17 Feb 2023 10:14:30 -0800 Subject: [PATCH 3/5] Fixed MergedCell content --- src-python/trp/__init__.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git 
a/src-python/trp/__init__.py b/src-python/trp/__init__.py index baa4daf..c70a102 100644 --- a/src-python/trp/__init__.py +++ b/src-python/trp/__init__.py @@ -109,7 +109,8 @@ class Geometry: def __init__(self, geometry): boundingBox = geometry["BoundingBox"] polygon = geometry["Polygon"] - bb = BoundingBox(boundingBox["Width"], boundingBox["Height"], boundingBox["Left"], boundingBox["Top"]) + bb = BoundingBox( + boundingBox["Width"], boundingBox["Height"], boundingBox["Left"], boundingBox["Top"]) pgs = [] for pg in polygon: pgs.append(Polygon(pg["X"], pg["Y"])) @@ -249,7 +250,8 @@ def __init__(self, block, blockMap): if ('Relationships' in vkvs): for vitem in vkvs['Relationships']: if (vitem["Type"] == "CHILD"): - self._value = FieldValue(vkvs, vitem['Ids'], blockMap) + self._value = FieldValue( + vkvs, vitem['Ids'], blockMap) else: logger.warning(f"no 'Relationships' in block: {block}") @@ -391,12 +393,13 @@ def __init__(self, block, blockMap, rows): for cid in rs['Ids']: blockType = blockMap[cid]["BlockType"] if (blockType == "CELL"): - child_cell = next((x for x in cells if x.id == cid), None) + child_cell = next( + (x for x in cells if x.id == cid), None) if child_cell != None: child_cell._isChildOfMergedCell = True child_cell._mergedCellParent = self - if len(self._text) == 0 and len(child_cell.text) > 0: - self._text = child_cell.text.strip() + if len(child_cell.text) > 0: + self._text += " " + child_cell.text.strip() if ('EntityTypes' in block and block['EntityTypes']): self._entityTypes = block['EntityTypes'] @@ -437,10 +440,12 @@ def __init__(self, block, blockMap): for cid in rs['Ids']: cell = Cell(blockMap[cid], blockMap) cells.append(cell) - cells.sort(key=lambda cell: (cell.rowIndex, cell.columnIndex)) + cells.sort(key=lambda cell: ( + cell.rowIndex, cell.columnIndex)) for row_index in range(1, max([x.rowIndex for x in cells]) + 1): new_row: Row = Row() - new_row.cells = [x for x in cells if x.rowIndex == row_index] + new_row.cells = [ + x for x in 
cells if x.rowIndex == row_index] self._rows.append(new_row) elif (rs['Type'] == 'MERGED_CELL'): self._merged_cells_ids = rs['Ids'] @@ -562,11 +567,12 @@ def getLinesInReadingOrder(self): for index, column in enumerate(columns): bbox_left = item.geometry.boundingBox.left bbox_right = item.geometry.boundingBox.left + item.geometry.boundingBox.width - bbox_centre = item.geometry.boundingBox.left + item.geometry.boundingBox.width / 2 + bbox_centre = item.geometry.boundingBox.left + \ + item.geometry.boundingBox.width / 2 column_centre = column['left'] + column['right'] / 2 if (bbox_centre > column['left'] and bbox_centre < column['right']) or (column_centre > bbox_left and column_centre < bbox_right): - #Bbox appears inside the column + # Bbox appears inside the column lines.append([index, item.text]) column_found = True break @@ -663,7 +669,8 @@ def _parseDocumentPagesAndBlockMap(self): if documentPage: documentPage.append(block) else: - logger.error("assumed documentPage not None, but was None") + logger.error( + "assumed documentPage not None, but was None") if (documentPage): documentPages.append({"Blocks": documentPage}) return documentPages, blockMap From 17db500728209c14626ceacd53ed8c3b475a38f4 Mon Sep 17 00:00:00 2001 From: Doug Qian Date: Wed, 1 Mar 2023 14:21:34 -0800 Subject: [PATCH 4/5] Add new header width enum option WIDE --- src-python/trp/t_tables.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src-python/trp/t_tables.py b/src-python/trp/t_tables.py index 7a27cde..8fc1685 100644 --- a/src-python/trp/t_tables.py +++ b/src-python/trp/t_tables.py @@ -15,6 +15,7 @@ class HeaderFooterType(Enum): NONE = 0 NARROW = 0.5 NORMAL = 1 + WIDE = 2.5 logger = logging.getLogger(__name__) From 027228de2a8b3a4690a52a8822beb516f39fbf60 Mon Sep 17 00:00:00 2001 From: Doug Qian Date: Wed, 1 Mar 2023 15:12:27 -0800 Subject: [PATCH 5/5] Add deleteBlockById method --- src-python/trp/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git 
a/src-python/trp/__init__.py b/src-python/trp/__init__.py index c70a102..ddd7d58 100644 --- a/src-python/trp/__init__.py +++ b/src-python/trp/__init__.py @@ -699,3 +699,6 @@ def getBlockById(self, blockId): if (self._blockMap and blockId in self._blockMap): block = self._blockMap[blockId] return block + + def deleteBlockById(self, blockId): + self._blockMap.pop(blockId, None)