apiVersion: ai.sap.com/v1alpha1
kind: ServingTemplate
metadata:
  name: transformers
  annotations:
    scenarios.ai.sap.com/description: "transformers"
    scenarios.ai.sap.com/name: "transformers"
    executables.ai.sap.com/description: "transformers"
    executables.ai.sap.com/name: "transformers"
  labels:
    scenarios.ai.sap.com/id: "transformers"
    ai.sap.com/version: "1.0"
spec:
  template:
    apiVersion: "serving.kserve.io/v1beta1"
    metadata:
      annotations: |
        autoscaling.knative.dev/metric: concurrency
        autoscaling.knative.dev/target: 1
        autoscaling.knative.dev/targetBurstCapacity: 0
      labels: |
        ai.sap.com/resourcePlan: infer.l
    spec: |
      predictor:
        imagePullSecrets:
          - name: felixdockersecrect
        minReplicas: 1
        maxReplicas: 5
        containers:
          - name: kserve-container
            image: docker.io/bfwork/huggingcore-customgpu
            ports:
              - containerPort: 8080
                protocol: TCP
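Once the serving template is synced, a deployment can be created through the AI API. The following is a minimal sketch using the ai-api-client-sdk; all credentials and the configuration id are placeholders for your own tenant, and the resulting deployment_resp is reused further down.

from ai_api_client_sdk.ai_api_v2_client import AIAPIV2Client

# Placeholder values from your AI Core service key (assumptions, not from this post)
AI_API_URL = "<your AI API url>"
AUTH_URL = "<your auth url>"
CLIENT_ID = "<your client id>"
CLIENT_SECRET = "<your client secret>"
RESOURCE_GROUP = "<your resource group>"

ai_api_v2_client = AIAPIV2Client(
    base_url=AI_API_URL + "/v2/lm",
    auth_url=AUTH_URL + "/oauth/token",
    client_id=CLIENT_ID,
    client_secret=CLIENT_SECRET,
    resource_group=RESOURCE_GROUP,
)

# Create a deployment from a configuration that references the
# "transformers" scenario/executable registered by the template above
deployment_resp = ai_api_v2_client.deployment.create(configuration_id="<your configuration id>")
print(deployment_resp.id)

The container image referenced in the template, docker.io/bfwork/huggingcore-customgpu, is built in two steps: a GPU base image with the Hugging Face stack, and a serving image on top of it. The base image first: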
FROM nvidia/cuda:11.7.1-cudnn8-devel-ubuntu20.04
LABEL maintainer="Hugging Face"
ARG DEBIAN_FRONTEND=noninteractive
RUN apt update
RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg
RUN python3 -m pip install --no-cache-dir --upgrade pip
RUN python3 -m pip install --no-cache-dir transformers==4.31.0
# If set to nothing, will install the latest version
ARG PYTORCH='2.0.1'
ARG TORCH_VISION=''
ARG TORCH_AUDIO=''
# Example: `cu102`, `cu113`, etc.
ARG CUDA='cu117'
RUN [ ${#PYTORCH} -gt 0 ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; python3 -m pip install --no-cache-dir -U $VERSION --extra-index-url https://download.pytorch.org/whl/$CUDA
RUN [ ${#TORCH_VISION} -gt 0 ] && VERSION='torchvision=='$TORCH_VISION'.*' || VERSION='torchvision'; python3 -m pip install --no-cache-dir -U $VERSION --extra-index-url https://download.pytorch.org/whl/$CUDA
RUN [ ${#TORCH_AUDIO} -gt 0 ] && VERSION='torchaudio=='$TORCH_AUDIO'.*' || VERSION='torchaudio'; python3 -m pip install --no-cache-dir -U $VERSION --extra-index-url https://download.pytorch.org/whl/$CUDA
RUN python3 -m pip uninstall -y tensorflow flax
RUN python3 -m pip install -U "itsdangerous<2.1.0"
RUN python3 -m pip install -U accelerate einops bitsandbytes-cuda117
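Before building the serving layer, it is worth checking that the cu117 PyTorch wheels in this base image actually see a GPU. A quick, hypothetical sanity check to run inside the container:

import torch

# Verify the CUDA-enabled PyTorch install from the base image
print("torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("device:", torch.cuda.get_device_name(0))

The serving image then extends this base: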
FROM bfwork/huggingcore-transformers
WORKDIR /serving
COPY requirements.txt requirements.txt
RUN pip3 install -r requirements.txt
ENV LC_ALL=C.UTF-8
ENV LANG=C.UTF-8
# Use ENV (not RUN export) so the variables survive into the running container
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/usr/local/cuda-10.0/targets/x86_64-linux/lib:/usr/local/cuda-10.2/targets/x86_64-linux/lib:/usr/local/cuda-11/targets/x86_64-linux/lib:/usr/local/cuda-11.6/targets/x86_64-linux/lib/stubs:/usr/local/cuda-11.6/compat:/usr/local/cuda-11.6/targets/x86_64-linux/lib
ENV PATH=$PATH:/usr/local/cuda-11/bin
# Required for huggingface
RUN mkdir -p /nonexistent/
RUN mkdir -p /transformerscache/
RUN chown -R 1000:1000 /nonexistent
RUN chown -R 1000:1000 /transformerscache
RUN chmod -R 777 /nonexistent
RUN chmod -R 777 /transformerscache
ENV TRANSFORMERS_CACHE=/transformerscache
COPY /serving /serving
ENV MODEL_CLASS="PIPELINE"
CMD ["uvicorn", "app:api", "--host", "0.0.0.0", "--port", "8080"]
In the serving Dockerfile we extend the LD_LIBRARY_PATH variable, which is necessary for AI Core. Furthermore, we create various folders and configure their permissions so they are writable. These steps are crucial, as Transformers downloads model files to disk, into the directory specified by the TRANSFORMERS_CACHE environment variable. For a detailed explanation of the Dockerfile, check out my last blog post.

The serving application started by the CMD above is a small FastAPI app (app.py):

from fastapi import FastAPI, Request
from model_pipeline import Model

api = FastAPI()

@api.on_event("startup")
async def on_app_start():
    """this function is called on startup and facilitates the loading and setup of the model for inference"""
    Model.setup()

@api.post("/v2/predict")
async def predict(request: Request):
    """this function exposes the inference endpoint, expecting a json object with the prompt and a dictionary of arguments for the model"""
    request_content = await request.json()
    return Model.predict(request_content["prompt"], args=request_content["args"])
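With the container running locally (for example via docker run -p 8080:8080), the endpoint can be smoke-tested with a plain POST; a hypothetical call:

import requests

# Hypothetical local smoke test against the FastAPI app above
res = requests.post(
    "http://localhost:8080/v2/predict",
    json={"prompt": "Hello, llama!", "args": {"max_length": 50}},
)
print(res.status_code, res.json())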
The model itself is loaded and wrapped in model_pipeline.py:

import os
import sys

import torch
import transformers
import huggingface_hub

transformers.utils.logging.set_verbosity_debug()
transformers.utils.logging.disable_progress_bar()

HUB_TOKEN = "hf_qsb<your hf token>"
huggingface_hub.login(token=HUB_TOKEN)

class Model:
    generator = None

    def setup():
        """model setup"""
        print("START LOADING SETUP", file=sys.stderr)  # somehow AI Core's logs only show the error stream 🙂
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model_name = "meta-llama/Llama-2-7b-chat-hf"
        tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, use_auth_token=True)
        pipeline = transformers.pipeline(
            "text-generation",
            model=model_name,
            tokenizer=tokenizer,
            torch_dtype=torch.bfloat16,
            device=device,
            trust_remote_code=True,
            use_auth_token=True,
        )
        print("MODEL DEVICE", str(device), file=sys.stderr)
        Model.generator = lambda prompt, args: pipeline(
            prompt,
            **{
                "max_length": 2000,
                "do_sample": True,
                "top_k": 10,
                "num_return_sequences": 1,
                "eos_token_id": tokenizer.eos_token_id,
                **args,
            }
        )
        print("SETUP DONE", file=sys.stderr)

    def predict(prompt, args):
        """model prediction"""
        return Model.generator(prompt, args)
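The wrapper can also be exercised without the FastAPI layer, which is handy for debugging inside the container; a minimal sketch:

from model_pipeline import Model

# Load the model once (it is downloaded into TRANSFORMERS_CACHE on first run)
Model.setup()
print(Model.predict("Hello, llama!", args={"max_length": 50}))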
On the client side, prompts for the chat-tuned Llama 2 model need to follow its expected instruction format:

def build_llama2_prompt(role_prompt, task_prompt):
    B_S, E_S = "<s>", " </s>"
    B_INST, E_INST = "[INST]", " [/INST]\n"
    B_SYS, E_SYS = " <<SYS>>\n", "\n<</SYS>>\n\n"
    SYSTEM_PROMPT = B_SYS + role_prompt + E_SYS
    return B_S + B_INST + SYSTEM_PROMPT + task_prompt + E_INST
import requests

def get_response(full_prompt, args={}):
    # deployment_resp, ai_api_v2_client and RESOURCE_GROUP come from the deployment steps above
    res = requests.post(
        f"https://api.ai.internalprod.eu-central-1.aws.ml.hana.ondemand.com/v2/inference/deployments/{deployment_resp.id}/v2/predict",
        json={"prompt": full_prompt, "args": args},
        headers={
            "Authorization": ai_api_v2_client.rest_client.get_token(),
            "ai-resource-group": RESOURCE_GROUP,
            "Content-Type": "application/json",
        },
    )
    if res.status_code != 200:
        raise Exception("ERROR WITH DEPLOYMENT " + str(res.status_code) + " " + str(res.content))
    return res.json()[0]["generated_text"]

r = get_response(build_llama2_prompt(role_prompt="You are a poet!", task_prompt="Write a 5 line Poem, about lamas!"))

The value of r contains the full prompt followed by the model's generated continuation:
<s>[INST] <<SYS>>
You are a poet!
<</SYS>>
Write a 5 line Poem, about lamas! [/INST]
Oh, lamas, oh so serene,
With coats of gold, so divine.
Their eyes so bright, their steps so light,
They roam the mountains with such grace.
In peaceful silence, they take flight.