apache · sharkdtu · Feb 25, 2025 · Feb 25, 2025 · Fokko · Mar 4, 2025
diff --git a/pyiceberg/io/pyarrow.py b/pyiceberg/io/pyarrow.py
@@ -2547,6 +2547,7 @@ def _dataframe_to_data_files(
     table_metadata: TableMetadata,
     df: pa.Table,
     io: FileIO,
+    task_id: int = 0,
     write_uuid: Optional[uuid.UUID] = None,
     counter: Optional[itertools.count[int]] = None,
 ) -> Iterable[DataFile]:
@@ -2574,7 +2575,13 @@ def _dataframe_to_data_files(
             table_metadata=table_metadata,
             tasks=iter(
                 [
-                    WriteTask(write_uuid=write_uuid, task_id=next(counter), record_batches=batches, schema=task_schema)
+                    WriteTask(
+                        task_id=task_id,
+                        write_uuid=write_uuid,
+                        counter_id=next(counter),
+                        record_batches=batches,
+                        schema=task_schema,
+                    )
                     for batches in bin_pack_arrow_table(df, target_file_size)
                 ]
             ),
@@ -2587,8 +2594,9 @@ def _dataframe_to_data_files(
             tasks=iter(
                 [
                     WriteTask(
+                        task_id=task_id,
                         write_uuid=write_uuid,
-                        task_id=next(counter),
+                        counter_id=next(counter),
                         record_batches=batches,
                         partition_key=partition.partition_key,
                         schema=task_schema,

diff --git a/pyiceberg/table/__init__.py b/pyiceberg/table/__init__.py
@@ -1864,8 +1864,9 @@ def count(self) -> int:
 class WriteTask:
     """Task with the parameters for writing a DataFile."""
 
-    write_uuid: uuid.UUID
     task_id: int
+    write_uuid: uuid.UUID
+    counter_id: int
     schema: Schema
     record_batches: List[pa.RecordBatch]
     sort_order_id: Optional[int] = None
@@ -1874,7 +1875,7 @@ class WriteTask:
     def generate_data_file_filename(self, extension: str) -> str:
-    def generate_data_file_filename(self, extension: str) -> str:
+    def generate_data_file_filename(self, extension: str, task_id: Optional[int] = None) -> str:
         # Mimics the behavior in the Java API:
         # https://github.com/apache/iceberg/blob/a582968975dd30ff4917fbbe999f1be903efac02/core/src/main/java/org/apache/iceberg/io/OutputFileFactory.java#L92-L101
-        return f"00000-{self.task_id}-{self.write_uuid}.{extension}"
+        return f"00000-{self.task_id}-{self.write_uuid}-{self.counter_id:05d}.{extension}"
 
 
 @dataclass(frozen=True)