from runpod_flash import Endpoint, GpuType, NetworkVolume
vol = NetworkVolume(name="model-storage")

@Endpoint(
    name="model-server",
    gpu=GpuType.NVIDIA_A100_80GB_PCIe,
    volume=vol,
    dependencies=["torch", "transformers"],
)
async def run_inference(prompt: str) -> dict:
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Load the model from the network volume.
    # The volume persists across worker restarts and is shared between workers.
    model_path = "/runpod-volume/models/llama-7b"
    model = AutoModelForCausalLM.from_pretrained(model_path).to("cuda")
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Run inference on the GPU
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = model.generate(**inputs, max_length=100)
    text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return {"generated_text": text}