Here's my generation pattern:
from litellm import acompletion, completion
import asyncio, os, traceback
# Persona prompt for the model. (Typos fixed: "respondes" -> "responds",
# "AI's" -> "AIs", "the other" -> "the others" — these go straight to the
# model, so spelling errors degrade the instruction quality.)
SYSTEM_PROMPT = "You're a rapping assistant who always responds with the dopest flows. You're one of the top rapping AIs out there. Better than the others. Your name is Sir Mix-a-Token"
USER_MESSAGE = "Write a rap about how my life got flipped turned upside down."

# Standard OpenAI-style chat message list: system persona first, then the user turn.
MESSAGE_PAYLOAD = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": USER_MESSAGE},
]
# Model name was referenced but never defined (NameError at runtime).
# Resolve it from the environment with a sensible default so the script runs
# out of the box; override via LITELLM_MODEL.
# NOTE(review): if this constant is defined elsewhere in the full file, keep
# that definition and drop this fallback — confirm.
LITE_LLM_MODEL_NAME = os.environ.get("LITELLM_MODEL", "gpt-3.5-turbo")

# Stream a chat completion. Low temperature (0.2) + mild repetition penalties
# keep the output focused; max_tokens caps the response length.
response = completion(
    model=LITE_LLM_MODEL_NAME,
    messages=MESSAGE_PAYLOAD,
    temperature=0.2,
    top_p=0.9,
    frequency_penalty=0.1,
    presence_penalty=0.1,
    max_tokens=512,
    stream=True,
    stream_options={"include_usage": True},
)

# Print deltas as they arrive. With include_usage=True the FINAL chunk carries
# only usage stats and has an EMPTY `choices` list — indexing chunk.choices[0]
# unconditionally raises IndexError on it, so guard the list first.
for chunk in response:
    if chunk.choices and chunk.choices[0].delta.content is not None:
        print(chunk.choices[0].delta.content, end="")