Anybody else have this issue where the LLM spans show up twice? Anything I'm missing ?
Hi Swaraj. Can you share the code that you used to produce this?
import os
from urllib.parse import urljoin

from openinference.instrumentation.llama_index import LlamaIndexInstrumentor
from opentelemetry import trace as trace_api
from opentelemetry.context import _SUPPRESS_INSTRUMENTATION_KEY, attach, detach, set_value
from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter
from opentelemetry.sdk import trace as trace_sdk
from opentelemetry.sdk.trace.export import SimpleSpanProcessor
def instrument():
tracer_provider = trace_sdk.TracerProvider()
span_exporter = OTLPSpanExporter(
urljoin(os.environ["PHOENIX_COLLECTOR_ENDPOINT"], "/v1/traces")
)
span_processor = SimpleSpanProcessor(span_exporter)
tracer_provider.add_span_processor(span_processor)
trace_api.set_tracer_provider(tracer_provider)
LlamaIndexInstrumentor().instrument()
if os.environ.get("PHOENIX_COLLECTOR_ENDPOINT"):
instrument()how I set up the instrumentation for our azure function app
I'm using this snippet that generates the double LLM spans:
azure_open_ai = CachedAzureOpenAI(
model="gpt-4",
engine="gpt-4",
api_version="2024-02-15-preview",
temperature=0.23,
kv_store_cache=NoOpKVStore(),
)
response = azure_open_ai.chat(
messages=[ChatMessage(content="Why was 6 afraid of 7?")]
)
print(response)and this is the cached azure open ai object which uses AzureOpenAI
class CachedAzureOpenAI(AzureOpenAI):
"""Cached Azure OpenAI."""
llm_string: Optional[str] = Field(default=None)
kv_store_cache: Optional[KVStore] = Field(default=None, private=True)
def __init__(
self,
model: str,
engine: str,
api_version: str,
kv_store_cache: KVStore,
temperature: float = 0.1,
**kwargs,
):
parameters_dict = {
"model": model,
"engine": engine,
"api_version": api_version,
"temperature": temperature,
**kwargs,
}
super().__init__(**parameters_dict)
self.llm_string = str(sorted([(k, v) for k, v in parameters_dict.items()]))
self.kv_store_cache = kv_store_cache
@llm_chat_callback()
def chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse:
input_params_dict = {
"messages": repr(messages),
**kwargs,
}
input_params_str = str(sorted([(k, v) for k, v in input_params_dict.items()]))
cache_key = f"{self.llm_string}::{input_params_str}"
cached_value = self.kv_store_cache.get(cache_key)
if cached_value:
return ChatResponse.parse_raw(cached_value)
response: ChatResponse = super().chat(messages=messages, **kwargs)
# Raw field of the response contains objects that are not serializable
# so we set it to None
# We do not read the raw field from the response, so it is safe to set it to None
response.raw = None
self.kv_store_cache.set(cache_key, response.json())
return response
@llm_chat_callback()
async def achat(
self, messages: Sequence[ChatMessage], **kwargs: Any
) -> ChatResponse:
input_params_dict = {
"messages": repr(messages),
**kwargs,
}
input_params_str = str(sorted([(k, v) for k, v in input_params_dict.items()]))
cache_key = f"{self.llm_string}::{input_params_str}"
cached_value = self.kv_store_cache.get(cache_key)
if cached_value:
return ChatResponse.parse_raw(cached_value)
response = await super().achat(messages=messages, **kwargs)
# Raw field of the response contains objects that are not serializable
# so we set it to None
# We do not read the raw field from the response, so it is safe to set it to None
response.raw = None
self.kv_store_cache.set(cache_key, response.json())
return responseah interesting if i just do
out = AzureOpenAI(
model="gpt-4",
engine="gpt-4",
api_version="2024-02-15-preview",
temperature=0.23,
).chat(messages=[ChatMessage(content="Why was 6 afraid of 7?")])
print(out)instead of using the cached azure open ai, then I only see one span
you can try removing this decorator
@llm_chat_callback()
if you look at super().chat(...) it's also decorated by the same decorator
that's why two spans appear
each decorator generates a span
i guess technically you would only get two if the cache misses
you can also try to suppress the second span (when cache is missed) using the following technique. First import these
from opentelemetry.context import _SUPPRESS_INSTRUMENTATION_KEY, attach, detach, set_value
then wrap the super().chat(...) like the following
token = attach(set_value(_SUPPRESS_INSTRUMENTATION_KEY, True))
response: ChatResponse = super().chat(messages=messages, **kwargs)
detach(token)