From b944d17ca888676ccc71fcdf5f39e6cc954f7e62 Mon Sep 17 00:00:00 2001 From: Sally Seok Date: Sat, 29 Nov 2025 00:12:47 -0800 Subject: [PATCH 1/7] Refactor vllm sm endpoint test to use pytest Signed-off-by: Sally Seok --- .github/workflows/pr-sglang.yml | 1 - .github/workflows/pr-vllm.yml | 3 +- test/sglang/sagemaker/test_sm_endpoint.py | 28 +- test/test_utils/__init__.py | 21 ++ test/test_utils/constants.py | 2 + test/vllm/sagemaker/test_sm_endpoint.py | 355 ++++++++-------------- 6 files changed, 156 insertions(+), 254 deletions(-) diff --git a/.github/workflows/pr-sglang.yml b/.github/workflows/pr-sglang.yml index f20099e50fa4..9bf6918eacf6 100644 --- a/.github/workflows/pr-sglang.yml +++ b/.github/workflows/pr-sglang.yml @@ -276,7 +276,6 @@ jobs: run: | uv venv source .venv/bin/activate - uv pip install -r test/requirements.txt uv pip install -r test/sglang/sagemaker/requirements.txt diff --git a/.github/workflows/pr-vllm.yml b/.github/workflows/pr-vllm.yml index bc28a6e405e6..e260f9cc4ec6 100644 --- a/.github/workflows/pr-vllm.yml +++ b/.github/workflows/pr-vllm.yml @@ -951,4 +951,5 @@ jobs: - name: Run sagemaker endpoint test run: | source .venv/bin/activate - python test/vllm/sagemaker/test_sm_endpoint.py --image-uri ${{ needs.set-sagemaker-test-environment.outputs.image-uri }} --endpoint-name test-sm-vllm-endpoint-${{ github.sha }} + cd test/ + python3 -m pytest -vs -rA --image-uri ${{ needs.set-sagemaker-test-environment.outputs.image-uri }} vllm/sagemaker diff --git a/test/sglang/sagemaker/test_sm_endpoint.py b/test/sglang/sagemaker/test_sm_endpoint.py index f5b0516235ad..07f4f802688e 100644 --- a/test/sglang/sagemaker/test_sm_endpoint.py +++ b/test/sglang/sagemaker/test_sm_endpoint.py @@ -17,11 +17,11 @@ from pprint import pformat import pytest -from botocore.exceptions import ClientError from sagemaker.model import Model from sagemaker.predictor import Predictor from sagemaker.serializers import JSONSerializer -from test_utils import clean_string, random_suffix_name, wait_for_status +from test_utils import clean_string, get_hf_token, random_suffix_name, wait_for_status +from test_utils.constants import INFERENCE_AMI_VERSION, SAGEMAKER_ROLE # To enable debugging, change logging.INFO to logging.DEBUG LOGGER = logging.getLogger(__name__) @@ -38,23 +38,6 @@ def get_endpoint_status(sagemaker_client, endpoint_name): return response["EndpointStatus"] -def get_hf_token(aws_session): - LOGGER.info("Retrieving HuggingFace token from AWS Secrets Manager...") - token_path = "test/hf_token" - - try: - get_secret_value_response = aws_session.secretsmanager.get_secret_value(SecretId=token_path) - LOGGER.info("Successfully retrieved HuggingFace token") - except ClientError as e: - LOGGER.error(f"Failed to retrieve HuggingFace token: {e}") - raise e - - # Do not print secrets token in logs - response = json.loads(get_secret_value_response["SecretString"]) - token = response.get("HF_TOKEN") - return token - - @pytest.fixture(scope="function") def model_id(request): # Return the model_id given by the test parameter @@ -63,14 +46,13 @@ def model_id(request): @pytest.fixture(scope="function") def instance_type(request): - # Return the model_id given by the test parameter + # Return the instance_type given by the test parameter return request.param @pytest.fixture(scope="function") def model_package(aws_session, image_uri, model_id): sagemaker_client = aws_session.sagemaker - sagemaker_role = aws_session.iam_resource.Role("SageMakerRole").arn cleaned_id = clean_string(model_id.split("/")[1], "_./") 
model_name = random_suffix_name(f"sglang-{cleaned_id}-model-package", 50) @@ -82,7 +64,7 @@ def model_package(aws_session, image_uri, model_id): model = Model( name=model_name, image_uri=image_uri, - role=sagemaker_role, + role=SAGEMAKER_ROLE, predictor_cls=Predictor, env={ "SM_SGLANG_MODEL_PATH": model_id, @@ -111,7 +93,7 @@ def model_endpoint(aws_session, model_package, instance_type): instance_type=instance_type, initial_instance_count=1, endpoint_name=endpoint_name, - inference_ami_version="al2-ami-sagemaker-inference-gpu-3-1", + inference_ami_version=INFERENCE_AMI_VERSION, serializer=JSONSerializer(), wait=True, ) diff --git a/test/test_utils/__init__.py b/test/test_utils/__init__.py index 7c307b1b8259..4cae0dbfd6a9 100644 --- a/test/test_utils/__init__.py +++ b/test/test_utils/__init__.py @@ -16,6 +16,7 @@ When necessary, use docstrings to explain the functions' mechanisms. """ +import json import logging import random import string @@ -23,6 +24,9 @@ from collections.abc import Callable from typing import Any +from aws import AWSSessionManager +from botocore.exceptions import ClientError + LOGGER = logging.getLogger(__name__) LOGGER.setLevel(logging.INFO) @@ -58,3 +62,20 @@ def wait_for_status( LOGGER.error(f"Wait for status: {expected_status} timed out. Actual status: {actual_status}") return False + + +def get_hf_token(aws_session: AWSSessionManager) -> str: + LOGGER.info("Retrieving HuggingFace token from AWS Secrets Manager...") + token_path = "test/hf_token" + + try: + get_secret_value_response = aws_session.secretsmanager.get_secret_value(SecretId=token_path) + LOGGER.info("Successfully retrieved HuggingFace token") + except ClientError as e: + LOGGER.error(f"Failed to retrieve HuggingFace token: {e}") + raise e + + # Do not print secrets token in logs + response = json.loads(get_secret_value_response["SecretString"]) + token = response.get("HF_TOKEN") + return token diff --git a/test/test_utils/constants.py b/test/test_utils/constants.py index eaa7babdfaf9..36dbdfad230f 100644 --- a/test/test_utils/constants.py +++ b/test/test_utils/constants.py @@ -1 +1,3 @@ DEFAULT_REGION = "us-west-2" +SAGEMAKER_ROLE = "SageMakerRole" +INFERENCE_AMI_VERSION = "al2-ami-sagemaker-inference-gpu-3-1" diff --git a/test/vllm/sagemaker/test_sm_endpoint.py b/test/vllm/sagemaker/test_sm_endpoint.py index 076071163caa..13eabc17167e 100644 --- a/test/vllm/sagemaker/test_sm_endpoint.py +++ b/test/vllm/sagemaker/test_sm_endpoint.py @@ -12,243 +12,140 @@ # language governing permissions and limitations under the License. 
"""Integration test for serving endpoint with vLLM DLC""" -import argparse import json -import time +import logging +from pprint import pformat -import boto3 -from botocore.exceptions import ClientError -from sagemaker import serializers +import pytest from sagemaker.model import Model from sagemaker.predictor import Predictor +from sagemaker.serializers import JSONSerializer +from test_utils import clean_string, get_hf_token, random_suffix_name, wait_for_status +from test_utils.constants import INFERENCE_AMI_VERSION, SAGEMAKER_ROLE -# Fixed parameters -AWS_REGION = "us-west-2" -INSTANCE_TYPE = "ml.g5.12xlarge" -ROLE = "SageMakerRole" - - -def get_secret_hf_token(): - print("Retrieving HuggingFace token from AWS Secrets Manager...") - secret_name = "test/hf_token" - region_name = "us-west-2" - - session = boto3.session.Session() - client = session.client(service_name="secretsmanager", region_name=region_name) - try: - get_secret_value_response = client.get_secret_value(SecretId=secret_name) - print("Successfully retrieved HuggingFace token") - except ClientError as e: - print(f"Failed to retrieve HuggingFace token: {e}") - raise e - - response = json.loads(get_secret_value_response["SecretString"]) - return response - - -def deploy_endpoint(name, image_uri, role, instance_type): - try: - print(f"Starting deployment of endpoint: {name}") - print(f"Using image: {image_uri}") - print(f"Instance type: {instance_type}") - - response = get_secret_hf_token() - hf_token = response.get("HF_TOKEN") - print("Creating SageMaker model...") - - model = Model( - name=name, - image_uri=image_uri, - role=role, - env={ - "SM_VLLM_MODEL": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B", - "SM_VLLM_HF_TOKEN": hf_token, - }, - ) - print("Model created successfully") - print("Starting endpoint deployment (this may take 10-15 minutes)...") - - model.deploy( - instance_type=instance_type, - initial_instance_count=1, - endpoint_name=name, - inference_ami_version="al2-ami-sagemaker-inference-gpu-3-1", - wait=True, - ) - print("Endpoint deployment completed successfully") - return True - except Exception as e: - print(f"Deployment failed: {str(e)}") - return False - - -def invoke_endpoint(endpoint_name, prompt, max_tokens=2400, temperature=0.01): - try: - print(f"Creating predictor for endpoint: {endpoint_name}") - predictor = Predictor( - endpoint_name=endpoint_name, - serializer=serializers.JSONSerializer(), - ) - - payload = { - "messages": [{"role": "user", "content": prompt}], - "max_tokens": max_tokens, - "temperature": temperature, - "top_p": 0.9, - "top_k": 50, - } - print(f"Sending inference request with prompt: '{prompt[:50]}...'") - print(f"Request parameters: max_tokens={max_tokens}, temperature={temperature}") - - response = predictor.predict(payload) - print("Inference request completed successfully") - - if isinstance(response, bytes): - response = response.decode("utf-8") - - if isinstance(response, str): - try: - response = json.loads(response) - except json.JSONDecodeError: - print("Warning: Response is not valid JSON. 
Returning as string.") - - return response - except Exception as e: - print(f"Inference failed: {str(e)}") - return None - - -def delete_endpoint(endpoint_name): - try: - sagemaker_client = boto3.client("sagemaker", region_name=AWS_REGION) - - print(f"Deleting endpoint: {endpoint_name}") - sagemaker_client.delete_endpoint(EndpointName=endpoint_name) - - print(f"Deleting endpoint configuration: {endpoint_name}") - sagemaker_client.delete_endpoint_config(EndpointConfigName=endpoint_name) - - print(f"Deleting model: {endpoint_name}") - sagemaker_client.delete_model(ModelName=endpoint_name) - - print("Successfully deleted all resources") - return True - except Exception as e: - print(f"Error during deletion: {str(e)}") - return False - - -def wait_for_endpoint(endpoint_name, timeout=1800): - sagemaker_client = boto3.client("sagemaker", region_name=AWS_REGION) - start_time = time.time() - - while time.time() - start_time < timeout: - try: - response = sagemaker_client.describe_endpoint(EndpointName=endpoint_name) - status = response["EndpointStatus"] - - if status == "InService": - return True - elif status in ["Failed", "OutOfService"]: - print(f"Endpoint creation failed with status: {status}") - return False - - print(f"Endpoint status: {status}. Waiting...") - time.sleep(30) - except Exception as e: - print(f"Error checking endpoint status: {str(e)}") - return False - - print("Timeout waiting for endpoint to be ready") - return False - - -def test_vllm_on_sagemaker(image_uri, endpoint_name): - print("\n" + "=" * 80) - print("STARTING vLLM SAGEMAKER ENDPOINT TEST".center(80)) - print("=" * 80) - print("Test Configuration:") - print(f" Image URI: {image_uri}") - print(f" Endpoint name: {endpoint_name}") - print(f" Region: {AWS_REGION}") - print(f" Instance type: {INSTANCE_TYPE}") - print("\n" + "-" * 80) - print("PHASE 1: ENDPOINT DEPLOYMENT".center(80)) - print("-" * 80) - - if not deploy_endpoint(endpoint_name, image_uri, ROLE, INSTANCE_TYPE): - print("\n" + "=" * 80) - print("DEPLOYMENT FAILED - CLEANING UP".center(80)) - print("=" * 80) - # Cleanup any partially created resources - delete_endpoint(endpoint_name) - raise Exception("SageMaker endpoint deployment failed") - - print("\n" + "-" * 80) - print("PHASE 2: WAITING FOR ENDPOINT READINESS".center(80)) - print("-" * 80) - if not wait_for_endpoint(endpoint_name): - print("\nEndpoint failed to become ready. 
Initiating cleanup...") - delete_endpoint(endpoint_name) - print("\n" + "=" * 80) - print("ENDPOINT READINESS FAILED".center(80)) - print("=" * 80) - raise Exception("SageMaker endpoint failed to become ready") - - print("\nEndpoint is ready for inference!") - print("\n" + "-" * 80) - print("PHASE 3: TESTING INFERENCE".center(80)) - print("-" * 80) - test_prompt = "Write a python script to calculate square of n" - - response = invoke_endpoint( - endpoint_name=endpoint_name, prompt=test_prompt, max_tokens=2400, temperature=0.01 +# To enable debugging, change logging.INFO to logging.DEBUG +LOGGER = logging.getLogger(__name__) +LOGGER.setLevel(logging.INFO) + +ENDPOINT_WAIT_PERIOD = 60 +ENDPOINT_WAIT_LENGTH = 30 +ENDPOINT_INSERVICE = "InService" + + +def get_endpoint_status(sagemaker_client, endpoint_name): + response = sagemaker_client.describe_endpoint(EndpointName=endpoint_name) + LOGGER.debug(f"Describe endpoint response: {pformat(response)}") + return response["EndpointStatus"] + + +@pytest.fixture(scope="function") +def model_id(request): + # Return the model_id given by the test parameter + return request.param + + +@pytest.fixture(scope="function") +def instance_type(request): + # Return the instance_type given by the test parameter + return request.param + + +@pytest.fixture(scope="function") +def model_package(aws_session, image_uri, model_id): + sagemaker_client = aws_session.sagemaker + cleaned_id = clean_string(model_id.split("/")[1], "_./") + model_name = random_suffix_name(f"vllm-{cleaned_id}-model-package", 50) + + LOGGER.debug(f"Using image: {image_uri}") + LOGGER.debug(f"Model ID: {model_id}") + + LOGGER.info(f"Creating SageMaker model: {model_name}...") + hf_token = get_hf_token(aws_session) + model = Model( + name=model_name, + image_uri=image_uri, + role=SAGEMAKER_ROLE, + predictor_cls=Predictor, + env={ + "SM_VLLM_MODEL": model_id, + "SM_VLLM_HF_TOKEN": hf_token, + }, ) + LOGGER.info("Model created successfully") + + yield model + + LOGGER.info(f"Deleting model: {model_name}") + sagemaker_client.delete_model(ModelName=model_name) - if response: - print("\n Inference test successful!") - print("\n Response from endpoint:") - print("-" * 40) - if isinstance(response, (dict, list)): - print(json.dumps(response, indent=2)) - else: - print(response) - print("-" * 40) - - print("\n" + "-" * 80) - print(" PHASE 4: CLEANUP".center(80)) - print("-" * 80) - if delete_endpoint(endpoint_name): - print("\n" + "=" * 80) - print(" TEST COMPLETED SUCCESSFULLY! 
".center(80)) - print("=" * 80) - else: - print("\n Cleanup failed") - else: - print("\n No response received from the endpoint.") - print("\n" + "-" * 80) - print(" PHASE 4: CLEANUP (FAILED INFERENCE)".center(80)) - print("-" * 80) - delete_endpoint(endpoint_name) - print("\n" + "=" * 80) - print(" TEST FAILED ".center(80)) - print("=" * 80) - raise Exception("SageMaker endpoint inference test failed") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Test vLLM SageMaker endpoint deployment and inference" + +@pytest.fixture(scope="function") +def model_endpoint(aws_session, model_package, instance_type): + sagemaker_client = aws_session.sagemaker + model = model_package + cleaned_instance = clean_string(instance_type, "_./") + endpoint_name = random_suffix_name(f"vllm-{cleaned_instance}-endpoint", 50) + + LOGGER.debug(f"Using instance type: {instance_type}") + + LOGGER.info("Starting endpoint deployment (this may take 10-15 minutes)...") + predictor = model.deploy( + instance_type=instance_type, + initial_instance_count=1, + endpoint_name=endpoint_name, + inference_ami_version=INFERENCE_AMI_VERSION, + serializer=JSONSerializer(), + wait=True, ) - parser.add_argument( - "--image-uri", required=True, help="Docker image URI for the vLLM SageMaker model" + LOGGER.info("Endpoint deployment completed successfully") + + LOGGER.info(f"Waiting for endpoint {ENDPOINT_INSERVICE} status ...") + assert wait_for_status( + ENDPOINT_INSERVICE, + ENDPOINT_WAIT_PERIOD, + ENDPOINT_WAIT_LENGTH, + get_endpoint_status, + sagemaker_client, + endpoint_name, ) - parser.add_argument("--endpoint-name", required=True, help="Name for the SageMaker endpoint") - args = parser.parse_args() + yield predictor + + LOGGER.info(f"Deleting endpoint: {endpoint_name}") + sagemaker_client.delete_endpoint(EndpointName=endpoint_name) + + LOGGER.info(f"Deleting endpoint configuration: {endpoint_name}") + sagemaker_client.delete_endpoint_config(EndpointConfigName=endpoint_name) + + +@pytest.mark.parametrize("instance_type", ["ml.g5.12xlarge"], indirect=True) +@pytest.mark.parametrize("model_id", ["deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"], indirect=True) +def test_vllm_sagemaker_endpoint(model_endpoint): + predictor = model_endpoint + + prompt = "Write a python script to calculate square of n" + payload = { + "messages": [{"role": "user", "content": prompt}], + "max_tokens": 2400, + "temperature": 0.01, + "top_p": 0.9, + "top_k": 50, + } + LOGGER.debug(f"Sending inference request with payload: {pformat(payload)}") + + response = predictor.predict(payload) + LOGGER.info("Inference request invoked successfully") + + if isinstance(response, bytes): + response = response.decode("utf-8") + + if isinstance(response, str): + try: + response = json.loads(response) + except json.JSONDecodeError: + LOGGER.warning("Response is not valid JSON. Returning as string.") + + assert response, "Model response is empty, failing endpoint test!" 
- try: - test_vllm_on_sagemaker(args.image_uri, args.endpoint_name) - except Exception as e: - print(f"\nScript failed with error: {e}") - exit(1) + LOGGER.info(f"Model response: {pformat(response)}") + LOGGER.info("Inference test successful!") From 5a5194515708d0fa8eb30db81e884e81688ef503 Mon Sep 17 00:00:00 2001 From: Sally Seok Date: Mon, 1 Dec 2025 11:30:13 -0800 Subject: [PATCH 2/7] empty commit for test Signed-off-by: Sally Seok From ac0947026b4b3885e28b647cab9197182475e7c6 Mon Sep 17 00:00:00 2001 From: Sally Seok Date: Mon, 1 Dec 2025 11:41:32 -0800 Subject: [PATCH 3/7] fix import Signed-off-by: Sally Seok --- test/test_utils/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/test_utils/__init__.py b/test/test_utils/__init__.py index 4cae0dbfd6a9..5e62dbb58f89 100644 --- a/test/test_utils/__init__.py +++ b/test/test_utils/__init__.py @@ -24,9 +24,10 @@ from collections.abc import Callable from typing import Any -from aws import AWSSessionManager from botocore.exceptions import ClientError +from .aws import AWSSessionManager + LOGGER = logging.getLogger(__name__) LOGGER.setLevel(logging.INFO) From b99bcdf73a35c23f51baef3a1761d47c22a6e06e Mon Sep 17 00:00:00 2001 From: Sally Seok Date: Mon, 1 Dec 2025 16:08:50 -0800 Subject: [PATCH 4/7] empty commit for test Signed-off-by: Sally Seok From 5ddca6251345f843c6967f4d75b0655015496ec5 Mon Sep 17 00:00:00 2001 From: Sally Seok Date: Mon, 1 Dec 2025 17:53:24 -0800 Subject: [PATCH 5/7] revert sglang Signed-off-by: Sally Seok --- .github/workflows/pr-sglang.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pr-sglang.yml b/.github/workflows/pr-sglang.yml index 9bf6918eacf6..f20099e50fa4 100644 --- a/.github/workflows/pr-sglang.yml +++ b/.github/workflows/pr-sglang.yml @@ -276,6 +276,7 @@ jobs: run: | uv venv source .venv/bin/activate + uv pip install -r test/requirements.txt uv pip install -r test/sglang/sagemaker/requirements.txt From c59c40fd2b2c001f310047a3a95b94a8081bc2d5 Mon Sep 17 00:00:00 2001 From: Sally Seok Date: Tue, 2 Dec 2025 12:07:37 -0800 Subject: [PATCH 6/7] skip audio test Signed-off-by: Sally Seok --- .github/workflows/pr-vllm.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr-vllm.yml b/.github/workflows/pr-vllm.yml index e260f9cc4ec6..f6093b54348d 100644 --- a/.github/workflows/pr-vllm.yml +++ b/.github/workflows/pr-vllm.yml @@ -331,7 +331,7 @@ jobs: python3 offline_inference/basic/chat.py python3 offline_inference/prefix_caching.py python3 offline_inference/llm_engine_example.py - python3 offline_inference/audio_language.py --seed 0 + # python3 offline_inference/audio_language.py --seed 0 python3 offline_inference/vision_language.py --seed 0 python3 offline_inference/vision_language_pooling.py --seed 0 python3 offline_inference/vision_language_multi_image.py --seed 0 From 951c5c229ccaf68f90b25d4bd1583dbdca775441 Mon Sep 17 00:00:00 2001 From: Sally Seok Date: Tue, 2 Dec 2025 13:20:34 -0800 Subject: [PATCH 7/7] add docs link comment Signed-off-by: Sally Seok --- test/test_utils/constants.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/test_utils/constants.py b/test/test_utils/constants.py index 36dbdfad230f..95da4aeef290 100644 --- a/test/test_utils/constants.py +++ b/test/test_utils/constants.py @@ -1,3 +1,4 @@ DEFAULT_REGION = "us-west-2" SAGEMAKER_ROLE = "SageMakerRole" +# https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_ProductionVariant.html INFERENCE_AMI_VERSION = "al2-ami-sagemaker-inference-gpu-3-1"
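
How the new invocation is wired together: the pr-vllm.yml change above replaces the argparse entry point with "cd test/ && python3 -m pytest -vs -rA --image-uri <uri> vllm/sagemaker", and the refactored test module consumes aws_session and image_uri as fixtures without defining them, so both the CLI flag and those fixtures presumably come from a shared conftest.py under test/ that already serves the sglang suite. A minimal sketch of such a conftest follows; the pytest hooks used (pytest_addoption, request.config.getoption) are standard, but the fixture scopes, the import path, and the AWSSessionManager constructor arguments are assumptions and may differ from the real file, which is not part of this patch series.

    import pytest

    from test_utils.aws import AWSSessionManager  # assumed import path for the session helper
    from test_utils.constants import DEFAULT_REGION


    def pytest_addoption(parser):
        # Registers the flag passed by the workflow: --image-uri <image under test>
        parser.addoption("--image-uri", action="store", default=None, help="Docker image URI to deploy")


    @pytest.fixture(scope="session")
    def image_uri(request):
        # Hands the CLI value to any test or fixture that declares an image_uri argument.
        uri = request.config.getoption("--image-uri")
        if not uri:
            pytest.fail("--image-uri is required for the SageMaker endpoint tests")
        return uri


    @pytest.fixture(scope="session")
    def aws_session():
        # Shared boto3-backed session manager; constructor arguments here are a guess.
        return AWSSessionManager(region_name=DEFAULT_REGION)

With a conftest along these lines in place, the workflow step can be reproduced locally against a pushed image, for example: cd test/ && python3 -m pytest -vs -rA --image-uri <account>.dkr.ecr.us-west-2.amazonaws.com/<repo>:<tag> vllm/sagemaker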