Skip to content

Commit 1c94886

Browse files
TAO 5.0 Release - PyTorch
1 parent 3421ffc commit 1c94886

File tree

5 files changed

+34
-27
lines changed

5 files changed

+34
-27
lines changed

README.md

+3-5
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ In order to maintain a uniform development environment across all users, TAO Toolk
8282
```sh
8383
usage: tao_pt [-h] [--gpus GPUS] [--volume VOLUME] [--env ENV]
8484
[--mounts_file MOUNTS_FILE] [--shm_size SHM_SIZE]
85-
[--run_as_user] [--tag TAG] [--ulimit ULIMIT] [--port PORT]
85+
[--run_as_user] [--ulimit ULIMIT] [--port PORT]
8686

8787
Tool to run the pytorch container.
8888

@@ -94,7 +94,6 @@ optional arguments:
9494
--mounts_file MOUNTS_FILE Path to the mounts file.
9595
--shm_size SHM_SIZE Shared memory size for docker
9696
--run_as_user Flag to run as user
97-
--tag TAG The tag value for the local dev docker.
9897
--ulimit ULIMIT Docker ulimits for the host machine.
9998
--port PORT Port mapping (e.g. 8889:8889).
10099

@@ -123,11 +122,10 @@ cd $NV_TAO_PYTORCH_TOP/docker
123122
```
124123

125124
#### <a name='Testthenewlybuiltbasedocker'></a>Test the newly built base docker
126-
127-
The build script tags the newly built base docker with the username of the account on the user's local machine. Therefore, developers may test their new docker by using the `tao_pt` command with the `--tag` option.
125+
Developers may test their new docker by using the `tao_pt` command.
128126

129127
```sh
130-
tao_pt --tag $USER -- script args
128+
tao_pt -- script args
131129
```
132130

133131
#### <a name='Updatethenewdocker'></a>Update the new docker

docker/build.sh

+5-7
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,10 @@
33
set -eo pipefail
44
cd "$( dirname "${BASH_SOURCE[0]}" )"
55

6-
registry="nvcr.io"
7-
repository="nvidia/tao/tao-toolkit"
8-
9-
tag="5.0.0-pyt-base"
10-
local_tag="$USER"
6+
# Read parameters from manifest.json
7+
registry=`jq -r '.registry' $NV_TAO_PYTORCH_TOP/docker/manifest.json`
8+
repository=`jq -r '.repository' $NV_TAO_PYTORCH_TOP/docker/manifest.json`
9+
tag=`jq -r '.tag' $NV_TAO_PYTORCH_TOP/docker/manifest.json`
1110

1211
# Build parameters.
1312
BUILD_DOCKER="0"
@@ -55,11 +54,10 @@ if [ $BUILD_DOCKER = "1" ]; then
5554
else
5655
NO_CACHE=""
5756
fi
58-
DOCKER_BUILDKIT=1 docker build --pull -f $NV_TAO_PYTORCH_TOP/docker/Dockerfile -t $registry/$repository:$local_tag $NO_CACHE \
57+
DOCKER_BUILDKIT=1 docker build --pull -f $NV_TAO_PYTORCH_TOP/docker/Dockerfile -t $registry/$repository:$tag $NO_CACHE \
5958
--network=host $NV_TAO_PYTORCH_TOP/.
6059
if [ $PUSH_DOCKER = "1" ]; then
6160
echo "Pusing docker ..."
62-
docker tag $registry/$repository:$local_tag $registry/$repository:$tag
6361
docker push $registry/$repository:$tag
6462
digest=$(docker inspect --format='{{index .RepoDigests 0}}' $registry/$repository:$tag)
6563
echo -e "\033[1;33mUpdate the digest in the manifest.json file to:\033[0m"

docker/manifest.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
22
"registry": "nvcr.io",
33
"repository": "nvidia/tao/tao-toolkit",
4-
"digest": "sha256:50ab3fdd87c17a181e7bad0595fef1f70aeef40cd40d117528be1674c6c1739f"
4+
"tag": "5.0.0-pyt-base"
55
}

runner/tao_pt.py

+20-14
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828

2929
DOCKER_REGISTRY = docker_config["registry"]
3030
DOCKER_REPOSITORY = docker_config["repository"]
31-
DOCKER_DIGEST = docker_config["digest"]
31+
DOCKER_TAG = docker_config["tag"]
3232
DOCKER_COMMAND = "docker"
3333
HOME_PATH = os.path.expanduser("~")
3434
MOUNTS_PATH = os.path.join(HOME_PATH, ".tao_mounts.json")
@@ -57,14 +57,14 @@ def format_mounts(mount_points):
5757

5858
def check_image_exists(docker_image):
5959
"""Check if the image exists locally."""
60-
check_command = '{} images | grep "\\<{}\\>" >/dev/null 2>&1'.format(DOCKER_COMMAND, docker_image)
60+
check_command = '{} images | grep "\\<{}\\>" | grep "{}" >/dev/null 2>&1'.format(DOCKER_COMMAND, docker_image, DOCKER_TAG)
6161
rc = subprocess.call(check_command, stdout=sys.stderr, shell=True)
6262
return rc == 0
6363

6464

6565
def pull_base_container(docker_image):
6666
"""Pull the default base container."""
67-
pull_command = "{} pull {}@{}".format(DOCKER_COMMAND, docker_image, DOCKER_DIGEST)
67+
pull_command = "{} pull {}:{}".format(DOCKER_COMMAND, docker_image, DOCKER_TAG)
6868
rc = subprocess.call(pull_command, stdout=sys.stderr, shell=True)
6969
return rc == 0
7070

@@ -110,16 +110,23 @@ def get_docker_gpus_prefix(gpus):
110110
return gpu_string
111111

112112

113+
def create_base_docker():
114+
"""Function to create the base docker."""
115+
create_command = "bash {}/docker/build.sh --build".format(ROOT_DIR)
116+
try:
117+
subprocess.run(create_command, stdout=sys.stderr, shell=True, check=True)
118+
except subprocess.CalledProcessError as e:
119+
raise RuntimeError(f"Container build failed with error {e}")
120+
121+
113122
def instantiate_dev_docker(gpus, mount_file,
114123
mount_cli_list,
115124
env_var_list,
116-
tag, command, ulimit=None,
125+
command, ulimit=None,
117126
shm_size="16G", run_as_user=False,
118127
port_mapping=None):
119128
"""Instiate the docker container."""
120-
docker_image = "{}/{}@{}".format(DOCKER_REGISTRY, DOCKER_REPOSITORY, DOCKER_DIGEST)
121-
if tag is not None:
122-
docker_image = "{}/{}:{}".format(DOCKER_REGISTRY, DOCKER_REPOSITORY, tag)
129+
docker_image = "{}/{}:{}".format(DOCKER_REGISTRY, DOCKER_REPOSITORY, DOCKER_TAG)
123130

124131
# Invoking the nvidia docker.
125132
gpu_string = get_docker_gpus_prefix(gpus)
@@ -195,7 +202,6 @@ def parse_cli_args(args=None):
195202
parser.add_argument("--shm_size", help="Shared memory size for docker", default="16G", type=str)
196203
parser.add_argument("--run_as_user", help="Flag to run as user", action="store_true", default=False)
197204

198-
parser.add_argument("--tag", help="The tag value for the local dev docker.", default=None, type=str)
199205
parser.add_argument("--ulimit", action='append', help="Docker ulimits for the host machine." )
200206
parser.add_argument(
201207
"--port",
@@ -221,17 +227,17 @@ def main(cl_args=None):
221227
# parse command line args.
222228
args = parse_cli_args(tao_pt_args)
223229
docker_image = "{}/{}".format(DOCKER_REGISTRY, DOCKER_REPOSITORY)
224-
if args["tag"] is not None:
225-
docker_image = "{}:{}".format(docker_image, args["tag"])
230+
226231
if not check_image_exists(docker_image):
227-
assert pull_base_container(docker_image), "The base container doesn't exist locally and " "the pull failed."
232+
if not pull_base_container(docker_image):
233+
print("The base container doesn't exist locally and the pull failed. Hence creating the base container")
234+
create_base_docker()
228235
try:
229236
instantiate_dev_docker(
230237
args["gpus"], args["mounts_file"],
231238
args["volume"], args["env"],
232-
args["tag"], command_args,
233-
args["ulimit"], args["shm_size"],
234-
args["run_as_user"],
239+
command_args, args["ulimit"],
240+
args["shm_size"], args["run_as_user"],
235241
args['port']
236242
)
237243
except subprocess.CalledProcessError:

scripts/envsetup.sh

+5
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,11 @@ function _check_tlt_pytorch_requirements(){
5151
return 1
5252
fi
5353

54+
if ! command -v jq >/dev/null; then
55+
echo -e "\033[1;31mERROR:\033[0m jq not found"
56+
return 1
57+
fi
58+
5459
# Check if docker was installed.
5560
if ! command -v docker >/dev/null; then
5661
warnings+=("docker not found")

0 commit comments

Comments
 (0)