- import copy
- import pdb
import datetime
- import logging
import urllib
import uuid
import json
- from io import BytesIO
from pathlib import PurePath, PurePosixPath
- from socket import getfqdn
from typing import (
    Any,
    Dict,
-     Iterable,
    List,
-     MutableMapping,
    MutableSequence,
    Optional,
    Tuple,
…

from prov.identifier import Identifier
from prov.model import PROV, PROV_LABEL, PROV_TYPE, PROV_VALUE, ProvDocument, ProvEntity
- from schema_salad.sourceline import SourceLine
- from typing_extensions import TYPE_CHECKING
- from tools.load_ga_export import load_ga_history_export, GalaxyJob, GalaxyDataset
+ from tools.load_ga_export import load_ga_history_export, GalaxyJob
from ast import literal_eval
import os

…
from rocrate.provenance_constants import (
    ACCOUNT_UUID,
    CWLPROV,
-     ENCODING,
-     FOAF,
    METADATA,
    ORE,
    PROVENANCE,
    RO,
    SCHEMA,
    SHA1,
-     SHA256,
-     TEXT_PLAIN,
    UUID,
    WF4EVER,
    WFDESC,
…
# from rocrate.provenance import ResearchObject

from pathlib import Path
- import rocrate.rocrate as roc
+

def posix_path(local_path: str) -> str:
    return str(PurePosixPath(Path(local_path)))
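`posix_path` normalizes a platform-native path to POSIX form before it is embedded in the arcp URIs further down; a quick illustration (paths hypothetical):

```python
from pathlib import Path, PurePosixPath

def posix_path(local_path: str) -> str:
    return str(PurePosixPath(Path(local_path)))

posix_path("metadata/provenance")      # -> "metadata/provenance"
# On Windows, r"metadata\provenance" also becomes "metadata/provenance",
# because PurePosixPath reuses the parts already parsed by Path.
```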

+
def remove_escapes(s):
    escapes = ''.join([chr(char) for char in range(1, 32)])
    translator = str.maketrans('', '', escapes)
-     t = s.translate(translator)
+     s.translate(translator)
+
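Note that both the removed and the added line discard the result: `str.translate` returns a new string and leaves `s` untouched, so `remove_escapes` returns `None` either way. If the cleaned value is meant to be handed back, a sketch of the presumably intended version:

```python
def remove_escapes(s: str) -> str:
    # chr(1)..chr(31) are the ASCII control characters being stripped
    escapes = ''.join(chr(c) for c in range(1, 32))
    return s.translate(str.maketrans('', '', escapes))
```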

def reassign(d):
    for k, v in d.items():
@@ -78,16 +67,17 @@ def reassign(d):
        except ValueError:
            pass

+
class ProvenanceProfile:
-     """
+     """\
    Provenance profile.

    Populated from a galaxy workflow export.
    """

    def __init__(
        self,
-         ga_export: Dict,
+         ga_export: Dict,
        full_name: str = None,
        orcid: str = None,
        # prov_name: str = None,
@@ -112,12 +102,11 @@ def __init__(
        self.base_uri = "arcp://uuid,%s/" % self.ro_uuid
        self.document = ProvDocument()
        # TODO extract engine_uuid from galaxy, type: str
-         self.engine_uuid = "urn:uuid:%s" % uuid.uuid4()  #type: str
+         self.engine_uuid = "urn:uuid:%s" % uuid.uuid4()  # type: str
        self.full_name = full_name
        self.workflow_run_uuid = run_uuid or uuid.uuid4()
        self.workflow_run_uri = self.workflow_run_uuid.urn  # type: str
-
-         # move to separate function
+         # move to separate function
        metadata_export = load_ga_history_export(ga_export)
        self.generate_prov_doc()
        self.jobs = []
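The `arcp://uuid,…/` base URI is the root against which every research-object resource is resolved; for illustration (UUID hypothetical):

```python
import uuid

ro_uuid = uuid.uuid4()
base_uri = "arcp://uuid,%s/" % ro_uuid
# e.g. "arcp://uuid,5b0c3f2e-.../provenance/" for the provenance folder
provenance_uri = base_uri + "provenance/"
```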
@@ -153,7 +142,7 @@ def generate_prov_doc(self) -> Tuple[str, ProvDocument]:
        # PROV_TYPE: FOAF["OnlineAccount"],
        # TODO: change how we register galaxy version, probably a declare_version func
        # self.galaxy_version = self.ga_export["jobs_attrs"][0]["galaxy_version"]
-         # TODO: change notation to already imported namespaces? 
+         # TODO: change notation to already imported namespaces?
        self.document.add_namespace("wfprov", "http://purl.org/wf4ever/wfprov#")
        # document.add_namespace('prov', 'http://www.w3.org/ns/prov#')
        self.document.add_namespace("wfdesc", "http://purl.org/wf4ever/wfdesc#")
@@ -176,7 +165,7 @@ def generate_prov_doc(self) -> Tuple[str, ProvDocument]:
            "provenance", self.base_uri + posix_path(PROVENANCE) + "/"
        )
        # TODO: use appropriate refs for ga_export and related inputs
-         ro_identifier_workflow = self.base_uri + "ga_export" + "/" 
+         ro_identifier_workflow = self.base_uri + "ga_export" + "/"
        self.wf_ns = self.document.add_namespace("wf", ro_identifier_workflow)
        ro_identifier_input = (
            self.base_uri + "ga_export/datasets#"
@@ -240,15 +229,15 @@ def declare_process(
        """Record the start of each Process."""
        if process_run_id is None:
            process_run_id = uuid.uuid4().urn
-
-         cmd = ga_export_jobs_attrs["command_line"]
+
+         # cmd = ga_export_jobs_attrs["command_line"]
        process_name = ga_export_jobs_attrs["tool_id"]
        tool_version = ga_export_jobs_attrs["tool_version"]
        prov_label = "Run of " + process_name
        start_time = ga_export_jobs_attrs["create_time"]
        end_time = ga_export_jobs_attrs["update_time"]

-         #TODO: Find out how to include commandline as a string
+         # TODO: Find out how to include commandline as a string
        # cmd = self.document.entity(
        #     uuid.uuid4().urn,
        #     {PROV_TYPE: WFPROV["Artifact"], PROV_LABEL: ga_export_jobs_attrs["command_line"]}
@@ -259,9 +248,9 @@ def declare_process(
            start_time,
            end_time,
            {
-                 PROV_TYPE: WFPROV["ProcessRun"],
-                 PROV_LABEL: prov_label,
-                 #TODO: Find out how to include commandline as a string
+                 PROV_TYPE: WFPROV["ProcessRun"],
+                 PROV_LABEL: prov_label,
+                 # TODO: Find out how to include commandline as a string
                # PROV_LABEL: cmd
            },
        )
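For orientation, a self-contained sketch of the PROV activity this method records, using the `prov` package directly (tool name and timestamps are invented):

```python
import uuid
from prov.model import ProvDocument, PROV_LABEL, PROV_TYPE

doc = ProvDocument()
doc.add_namespace("wfprov", "http://purl.org/wf4ever/wfprov#")

process_run_id = uuid.uuid4().urn
# ProvDocument.activity(identifier, startTime, endTime, other_attributes)
doc.activity(
    process_run_id,
    "2021-03-01T10:00:00",  # stands in for create_time from the job attrs
    "2021-03-01T10:05:00",  # stands in for update_time from the job attrs
    {PROV_TYPE: "wfprov:ProcessRun", PROV_LABEL: "Run of fastqc"},
)
print(doc.get_provn())
```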
@@ -289,7 +278,7 @@ def used_artefacts(
        base += "/" + process_name
        tool_id = process_metadata["tool_id"]
        base += "/" + tool_id
-         items = ["inputs","outputs","parameters"]
+         items = ["inputs", "outputs", "parameters"]
        # print(process_metadata["params"])
        for item in items:
            # print(item)
@@ -317,7 +306,6 @@ def used_artefacts(

            # for artefact in value:
            try:
-                 # pdb.set_trace()
                entity = self.declare_artefact(value)
                self.document.used(
                    process_run_id,
@@ -356,7 +344,7 @@ def declare_artefact(self, value: Any) -> ProvEntity:
            # byte_s = BytesIO(value)
            # data_file = self.research_object.add_data_file(byte_s)
            # FIXME: Don't naively assume add_data_file uses hash in filename!
-             data_id = "data:%s" % str(value) # PurePosixPath(data_file).stem
+             data_id = "data:%s" % str(value)  # PurePosixPath(data_file).stem
            return self.document.entity(
                data_id,
                {PROV_TYPE: WFPROV["Artifact"], PROV_VALUE: str(value)},
@@ -394,7 +382,7 @@ def declare_artefact(self, value: Any) -> ProvEntity:
            )

        if value.get("class"):
-             #_logger.warning("Unknown data class %s.", value["class"])
+             # _logger.warning("Unknown data class %s.", value["class"])
            # FIXME: The class might be "http://example.com/somethingelse"
            coll.add_asserted_type(CWLPROV[value["class"]])

@@ -404,7 +392,7 @@ def declare_artefact(self, value: Any) -> ProvEntity:
            # clean up unwanted characters
            if isinstance(key, str):
                key = key.replace("|", "_")
-             if isinstance(val, str):
+             if isinstance(val, str):
                val = val.replace("|", "_")

            v_ent = self.declare_artefact(val)
@@ -451,7 +439,7 @@ def declare_artefact(self, value: Any) -> ProvEntity:
            # FIXME: list value does not support adding "@id"
            return coll
        except TypeError:
-             #_logger.warning("Unrecognized type %s of %r", type(value), value)
+             # _logger.warning("Unrecognized type %s of %r", type(value), value)
            # Let's just fall back to Python repr()
            entity = self.document.entity(uuid.uuid4().urn, {PROV_LABEL: repr(value)})
            # self.research_object.add_uri(entity.identifier.uri)
@@ -466,7 +454,7 @@ def declare_file(self, value: Dict) -> Tuple[ProvEntity, ProvEntity, str]:
        if "checksum" in value:
            csum = cast(str, value["checksum"])
            (method, checksum) = csum.split("$", 1)
-             if method == SHA1: # and self.research_object.has_data_file(checksum):
+             if method == SHA1:  # and self.research_object.has_data_file(checksum):
                entity = self.document.entity("data:" + checksum)

        if not entity and "location" in value:
@@ -513,8 +501,8 @@ def declare_file(self, value: Dict) -> Tuple[ProvEntity, ProvEntity, str]:

        # Check for secondaries
        for sec in cast(
-             # MutableSequence[CWLObjectType],
-             value.get("secondaryFiles", [])
+             # MutableSequence[CWLObjectType],
+             value.get("secondaryFiles", [])  # noqa
        ):
            # TODO: Record these in a specializationOf entity with UUID?
            if sec["class"] == "File":
@@ -535,8 +523,10 @@ def declare_file(self, value: Dict) -> Tuple[ProvEntity, ProvEntity, str]:

        return file_entity, entity, checksum

-     def declare_directory(self
-         # , value: CWLObjectType
+     def declare_directory(
+         self,
+         # value: CWLObjectType
+         value
    ) -> ProvEntity:
        """Register any nested files/directories."""
        # FIXME: Calculate a hash-like identifier for directory
@@ -647,12 +637,11 @@ def declare_string(self, value: str) -> Tuple[ProvEntity, str]:
        # checksum = PurePosixPath(data_file).name
        # FIXME: Don't naively assume add_data_file uses hash in filename!
        value = str(value).replace("|", "_")
-         data_id = "data:%s" % str(value) # PurePosixPath(data_file).stem
+         data_id = "data:%s" % str(value)  # PurePosixPath(data_file).stem
        entity = self.document.entity(
            data_id, {PROV_TYPE: WFPROV["Artifact"], PROV_VALUE: str(value)}
        )  # type: ProvEntity
-         return entity #, checksum
-
+         return entity  # , checksum

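Since the sanitized string doubles as its own identifier here, a small illustration of the entity this produces; the `data` namespace registration is assumed to happen in `generate_prov_doc`:

```python
from prov.model import ProvDocument, PROV_TYPE, PROV_VALUE

doc = ProvDocument()
doc.add_namespace("wfprov", "http://purl.org/wf4ever/wfprov#")
doc.add_namespace("data", "urn:hash::sha1:")  # assumed prefix, as in CWLProv

value = str("hg38|chr1").replace("|", "_")  # "|" would break the qualified name
entity = doc.entity("data:%s" % value, {PROV_TYPE: "wfprov:Artifact", PROV_VALUE: value})
print(entity.identifier)  # data:hg38_chr1
```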
    def generate_output_prov(
        self,
@@ -735,7 +724,7 @@ def activity_has_provenance(self, activity, prov_ids):
        self.document.activity(activity, other_attributes=attribs)
        # Tip: we can't use https://www.w3.org/TR/prov-links/#term-mention
        # as prov:mentionOf() is only for entities, not activities
-         uris = [i.uri for i in prov_ids]
+         # uris = [i.uri for i in prov_ids]
        # self.research_object.add_annotation(activity, uris, PROV["has_provenance"].uri)

    def finalize_prov_profile(self, name=None, out_path=None):
@@ -770,7 +759,7 @@ def finalize_prov_profile(self, name=None, out_path=None):

        # https://www.w3.org/TR/prov-xml/
        # serialized_prov_docs["xml"] = self.document.serialize(format="xml", indent=4)
-         prov_ids.append(self.provenance_ns[filename + ".xml"])
+         prov_ids.append(self.provenance_ns[filename + ".xml"])
        with open(basename + ".xml", "w") as provenance_file:
            self.document.serialize(provenance_file, format="xml", indent=4)

@@ -779,7 +768,6 @@ def finalize_prov_profile(self, name=None, out_path=None):
        prov_ids.append(self.provenance_ns[filename + ".provn"])
        with open(basename + ".provn", "w") as provenance_file:
            self.document.serialize(provenance_file, format="provn", indent=2)
-

        # https://www.w3.org/Submission/prov-json/
        # serialized_prov_docs["json"] = self.document.serialize(format="json", indent=2)
@@ -810,7 +798,6 @@ def finalize_prov_profile(self, name=None, out_path=None):
        prov_ids.append(self.provenance_ns[filename + ".jsonld"])
        with open(basename + ".jsonld", "w") as provenance_file:
            self.document.serialize(provenance_file, format="rdf", rdf_format="json-ld")
-

-         #_logger.debug("[provenance] added provenance: %s", prov_ids)
+         # _logger.debug("[provenance] added provenance: %s", prov_ids)
        return (serialized_prov_docs, prov_ids)
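For context, all four serializations written here go through `prov`'s `serialize()`; a standalone sketch (file names are illustrative):

```python
from prov.model import ProvDocument

doc = ProvDocument()
doc.add_namespace("ex", "http://example.org/")
doc.entity("ex:dataset1")

with open("primary.cwlprov.xml", "w") as f:      # PROV-XML
    doc.serialize(f, format="xml", indent=4)
with open("primary.cwlprov.provn", "w") as f:    # PROV-N
    doc.serialize(f, format="provn")
with open("primary.cwlprov.json", "w") as f:     # PROV-JSON
    doc.serialize(f, format="json", indent=2)
with open("primary.cwlprov.jsonld", "w") as f:   # JSON-LD via the RDF serializer
    doc.serialize(f, format="rdf", rdf_format="json-ld")
```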