Skip to content

Commit d8376bf

Browse files
committed
Ran against all yaml files under the cwl v1.2 tests directory
Slim down our schemas after creation
1 parent 9b6b899 commit d8376bf

5 files changed

+347
-561
lines changed

cwl_utils/inputs_schema_gen.py

+150-14
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import logging
99
import sys
1010
import json
11+
from copy import deepcopy
1112
from pathlib import Path
1213
from typing import Any, List, Union, Dict, Optional
1314
from urllib.parse import urlparse
@@ -18,8 +19,7 @@
1819
from cwl_utils.parser import load_document_by_uri, InputArraySchemaTypes, \
1920
InputEnumSchemaTypes, InputRecordSchemaTypes, File, Directory, WorkflowInputParameter, InputRecordSchema, \
2021
InputEnumSchema, InputArraySchema, Workflow, CommandLineTool
21-
from cwl_utils.utils import sanitise_schema_field, is_uri, to_pascal_case, get_value_from_uri, is_local_uri, \
22-
load_schema_from_uri
22+
from cwl_utils.utils import sanitise_schema_field, is_uri, to_pascal_case, get_value_from_uri, is_local_uri
2323

2424
_logger = logging.getLogger("cwl-inputs-schema-gen") # pylint: disable=invalid-name
2525
defaultStreamHandler = logging.StreamHandler() # pylint: disable=invalid-name
@@ -35,6 +35,9 @@
3535
"string": "string",
3636
"int": "integer",
3737
"float": "number",
38+
"long": "number",
39+
"double": "number",
40+
"null": "null"
3841
}
3942

4043
JSON_TEMPLATE_PATH = Path(__file__).parent.joinpath(
@@ -123,7 +126,11 @@ def generate_type_dict_from_type(self, type_item) -> Dict:
123126
return {
124127
"type": PRIMITIVE_TYPES_MAPPING[type_item]
125128
}
126-
elif type_item in ["File", "Directory"]:
129+
elif type_item in ["stdin"]:
130+
return {
131+
"$ref": f"#/definitions/File"
132+
}
133+
elif type_item in ["File", "Directory", "Any"]:
127134
return {
128135
"$ref": f"#/definitions/{type_item}"
129136
}
@@ -148,7 +155,7 @@ def generate_type_dict_from_type(self, type_item) -> Dict:
148155
return {
149156
"type": "object",
150157
"properties": {
151-
get_value_from_uri(prop.id): self.generate_type_dict_from_type(prop.type_)
158+
get_value_from_uri(prop.name): self.generate_type_dict_from_type(prop.type_)
152159
for prop in type_item.fields
153160
}
154161
}
@@ -162,6 +169,16 @@ def generate_type_dict_from_type(self, type_item) -> Dict:
162169
}
163170
else:
164171
raise ValueError(f"Unknown type: {type_item}")
172+
elif isinstance(type_item, List):
173+
# Nested schema
174+
return {
175+
"oneOf": list(
176+
map(
177+
lambda type_iter: self.generate_type_dict_from_type(type_iter),
178+
type_item
179+
)
180+
)
181+
}
165182
else:
166183
raise ValueError(f"Unknown type: {type_item}")
167184

@@ -222,7 +239,7 @@ def generate_json_schema_property_from_input_parameter(input_parameter: Workflow
222239
return JSONSchemaProperty(
223240
name=input_name,
224241
type_=input_parameter.type_,
225-
description=doc
242+
description=doc if doc is not None else ""
226243
)
227244

228245

@@ -235,10 +252,15 @@ def generate_definition_from_schema(schema: InputRecordSchema) -> Dict:
235252

236253
# Sanitise each field of the schema
237254
sanitised_fields = {}
238-
for field_key, field_value in schema.type_.get("fields").items():
255+
256+
for field in schema.fields:
239257
sanitised_fields.update(
240258
{
241-
field_key: sanitise_schema_field(field_value)
259+
get_value_from_uri(field.name): sanitise_schema_field(
260+
{
261+
"type": field.type_
262+
}
263+
)
242264
}
243265
)
244266

@@ -267,13 +289,13 @@ def generate_definition_from_schema(schema: InputRecordSchema) -> Dict:
267289
prop = JSONSchemaProperty(
268290
name=prop_name,
269291
type_=prop_obj.get("type"),
270-
description=prop_obj.get("doc"),
292+
description=prop_obj.get("doc", ""),
271293
required=required
272294
)
273295
property_list.append(prop)
274296

275297
return {
276-
to_pascal_case(schema.type_.get('name')): {
298+
to_pascal_case(get_value_from_uri(schema.name)): {
277299
"type": "object",
278300
"properties": {
279301
prop.name: prop.type_dict
@@ -307,13 +329,41 @@ def cwl_to_jsonschema(cwl_obj: Union[Workflow, CommandLineTool]) -> Any:
307329
# Load in all $imports to be referred by complex input types
308330
workflow_schema_definitions_list = list(
309331
map(
310-
lambda import_iter: generate_definition_from_schema(
311-
load_schema_from_uri(import_iter)
332+
lambda kv_schema_tuple_iter: generate_definition_from_schema(
333+
cwl_obj.loadingOptions.idx.get(kv_schema_tuple_iter[1][0])
312334
),
313-
cwl_obj.loadingOptions.imports
335+
filter(
336+
lambda idx_iter:
337+
isinstance(idx_iter[1][0], InputRecordSchemaTypes) or
338+
isinstance(idx_iter[1][0], InputArraySchemaTypes),
339+
cwl_obj.loadingOptions.idx
340+
)
314341
)
315342
)
316343

344+
if cwl_obj.requirements is not None:
345+
try:
346+
schema_def_requirement = next(
347+
filter(
348+
lambda requirement_iter: requirement_iter.class_ == "SchemaDefRequirement",
349+
cwl_obj.requirements
350+
)
351+
)
352+
353+
workflow_schema_definitions_list.extend(
354+
list(
355+
map(
356+
lambda schema_def_iter: generate_definition_from_schema(
357+
schema_def_iter
358+
),
359+
schema_def_requirement.types
360+
)
361+
)
362+
)
363+
364+
except StopIteration:
365+
pass
366+
317367
# Convert schema definitions to dict
318368
workflow_schema_definitions_dict = {}
319369
for schema_definition in workflow_schema_definitions_list:
@@ -334,7 +384,19 @@ def cwl_to_jsonschema(cwl_obj: Union[Workflow, CommandLineTool]) -> Any:
334384
{
335385
"type": "object",
336386
"properties": {
337-
prop.name: prop.type_dict
387+
prop.name: {
388+
"oneOf": [
389+
{
390+
"type": "null"
391+
},
392+
prop.type_dict
393+
]
394+
}
395+
if prop.required is False
396+
else
397+
{
398+
prop.name: prop.type_dict
399+
}
338400
for prop in properties
339401
},
340402
"required": [
@@ -350,6 +412,80 @@ def cwl_to_jsonschema(cwl_obj: Union[Workflow, CommandLineTool]) -> Any:
350412
workflow_schema_definitions_dict
351413
)
352414

415+
# Slim down the schema as required
416+
input_json_schema = slim_definitions(input_json_schema)
417+
418+
return input_json_schema
419+
420+
421+
def slim_definitions(input_json_schema: Dict) -> Dict:
    """
    Return a copy of the schema that keeps only the definitions in use.

    The generated schema carries quite a few definitions that are likely
    unused, particularly for a simple workflow. Starting from the top-level
    "properties", every "$ref" is followed - including the "$ref"s found
    inside the definitions that get reached - and every definition that is
    never reached is removed.

    :param input_json_schema: JSON schema dict; its "properties" entry is
        scanned for "$ref" values and its "definitions" entry is pruned
    :return: A deep copy of the schema without the unused definitions;
        the dict passed in is left unmodified
    :raises KeyError: If a "$ref" resolves to a name that is not present
        under "definitions"
    """

    def _find_refs(json_data: Any, found: List[str]) -> List[str]:
        # Collect the value of every "$ref" key nested anywhere inside
        # dicts and lists. The value of a "$ref" key itself is recorded
        # as-is and not searched any further.
        if isinstance(json_data, dict):
            for key, value in json_data.items():
                if key == "$ref":
                    found.append(value)
                else:
                    _find_refs(value, found)
        elif isinstance(json_data, list):
            for item in json_data:
                _find_refs(item, found)
        return found

    # Copy schema so that the caller's object is never mutated
    slimmed_schema = deepcopy(input_json_schema)

    # A schema without any definitions simply has nothing to slim down
    definitions = slimmed_schema.get("definitions", {})

    # Walk the reference graph with an explicit work list.
    # The visited set makes each definition get expanded exactly once, which
    # both terminates on self / mutually referencing definitions and avoids
    # re-adding names that were already collected.
    required_definitions = set()
    pending_refs = _find_refs(slimmed_schema.get("properties"), [])
    while pending_refs:
        definition_name = get_value_from_uri(pending_refs.pop())
        if definition_name in required_definitions:
            continue
        required_definitions.add(definition_name)
        # A dangling reference raises KeyError here
        pending_refs.extend(_find_refs(definitions[definition_name], []))

    # Drop everything that was never reached
    # (iterate over a snapshot of the keys since we delete while looping)
    for definition_key in list(definitions):
        if definition_key not in required_definitions:
            del definitions[definition_key]

    return slimmed_schema
354490

355491

@@ -430,7 +566,7 @@ def run(args: argparse.Namespace) -> int:
430566
except Exception as e:
431567
_logger.exception("Failed to generate JSON Schema from CWL inputs object. Error: %s", e)
432568
return 1
433-
args.output.write(json.dumps(jsonschema, indent=2))
569+
args.output.write(json.dumps(jsonschema, indent=2) + "\n")
434570

435571
return 0
436572

0 commit comments

Comments
 (0)