Troubleshooting DataFrame Saving and Loading in Jupyter Notebooks
Hi guys, first of all, thanks a lot for all the work you are doing. It's great to see how effortlessly it captures all the requests and stores them. I have a small question, though, and probably I'm doing it incorrectly. As I'm running some of my prototypes in Jupyter Notebooks, I tend to save stuff in dataframes. For my run, I tried to do it in the following way after running all prompts: `save_traces = px.Client().get_spans_dataframe()` followed by `save_traces.to_parquet('traces_step_4.parquet')`. When initiating a new run, I try to reload it this way: `save_traces = pd.read_parquet('traces_step_4.parquet')` and then `session = px.launch_app(trace=px.TraceDataset(save_traces))`. That seems to load the data I used before. However, when trying to save it again, it gives an error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In[50], line 1
----> 1 save_traces = px.Client().get_spans_dataframe()
3 save_traces.to_parquet('traces_step_4.parquet')
File ~/pytorch-test/env/lib/python3.9/site-packages/phoenix/session/data_extractor.py:39, in TraceDataExtractor.get_spans_dataframe(self, filter_condition, start_time, stop_time, root_spans_only, project_name)
28 def get_spans_dataframe(
29 self,
30 filter_condition: Optional[str] = None,
(...)
35 project_name: Optional[str] = None,
36 ) -> Optional[pd.DataFrame]:
37 return cast(
38 Optional[pd.DataFrame],
---> 39 self.query_spans(
40 SpanQuery().where(filter_condition or ""),
41 start_time=start_time,
42 stop_time=stop_time,
43 root_spans_only=root_spans_only,
44 project_name=project_name,
45 ),
46 )
File ~/pytorch-test/env/lib/python3.9/site-packages/phoenix/session/client.py:71, in Client.query_spans(self, start_time, stop_time, root_spans_only, project_name, *queries)
69 queries = (SpanQuery(),)
70 if self._use_active_session_if_available and (session := px.active_session()):
---> 71 return session.query_spans(
72 *queries,
73 start_time=start_time,
74 stop_time=stop_time,
75 root_spans_only=root_spans_only,
76 project_name=project_name,
77 )
78 response = self._session.get(
79 url=urljoin(self._base_url, "/v1/spans"),
80 json={
(...)
86 },
87 )
88 if response.status_code == 404:
File ~/pytorch-test/env/lib/python3.9/site-packages/phoenix/session/session.py:379, in ThreadSession.query_spans(self, start_time, stop_time, root_spans_only, project_name, *queries)
370 valid_eval_names = project.get_span_evaluation_names() if project else ()
371 queries = tuple(
372 SpanQuery.from_dict(
373 query.to_dict(),
(...)
377 for query in queries
378 )
--> 379 results = query_spans(
380 project,
381 *queries,
382 start_time=start_time,
383 stop_time=stop_time,
384 root_spans_only=root_spans_only,
385 )
386 if len(results) == 1:
387 df = results[0]
File ~/pytorch-test/env/lib/python3.9/site-packages/phoenix/utilities/__init__.py:26, in query_spans(project, start_time, stop_time, root_spans_only, *queries)
18 return []
19 spans = tuple(
20 project.get_spans(
21 start_time=start_time,
(...)
24 )
25 )
---> 26 return [query(spans) for query in queries]
File ~/pytorch-test/env/lib/python3.9/site-packages/phoenix/utilities/__init__.py:26, in <listcomp>(.0)
18 return []
19 spans = tuple(
20 project.get_spans(
21 start_time=start_time,
(...)
24 )
25 )
---> 26 return [query(spans) for query in queries]
File ~/pytorch-test/env/lib/python3.9/site-packages/phoenix/trace/dsl/query.py:294, in SpanQuery.__call__(self, spans)
289 spans = filter(
290 lambda span: (isinstance(seq := self._concat.value(span), Sequence) and len(seq)),
291 spans,
292 )
293 if not (self._select or self._explode or self._concat):
--> 294 if not (data := [json.loads(span_to_json(span)) for span in spans]):
295 return pd.DataFrame()
296 return (
297 pd.json_normalize(data, max_level=1)
298 .rename(self._rename, axis=1, errors="ignore")
299 .set_index("context.span_id", drop=False)
300 )
File ~/pytorch-test/env/lib/python3.9/site-packages/phoenix/trace/dsl/query.py:294, in <listcomp>(.0)
289 spans = filter(
290 lambda span: (isinstance(seq := self._concat.value(span), Sequence) and len(seq)),
291 spans,
292 )
293 if not (self._select or self._explode or self._concat):
--> 294 if not (data := [json.loads(span_to_json(span)) for span in spans]):
295 return pd.DataFrame()
296 return (
297 pd.json_normalize(data, max_level=1)
298 .rename(self._rename, axis=1, errors="ignore")
299 .set_index("context.span_id", drop=False)
300 )
File ~/pytorch-test/env/lib/python3.9/site-packages/phoenix/trace/span_json_encoder.py:52, in span_to_json(span)
51 def span_to_json(span: Span) -> str:
---> 52 return json.dumps(span, cls=SpanJSONEncoder)
File ~/pytorch-test/env/lib/python3.9/json/__init__.py:234, in dumps(obj, skipkeys, ensure_ascii, check_circular, allow_nan, cls, indent, separators, default, sort_keys, **kw)
232 if cls is None:
233 cls = JSONEncoder
--> 234 return cls(
235 skipkeys=skipkeys, ensure_ascii=ensure_ascii,
236 check_circular=check_circular, allow_nan=allow_nan, indent=indent,
237 separators=separators, default=default, sort_keys=sort_keys,
238 **kw).encode(obj)
File ~/pytorch-test/env/lib/python3.9/json/encoder.py:199, in JSONEncoder.encode(self, o)
195 return encode_basestring(o)
196 # This doesn't pass the iterator directly to ''.join() because the
197 # exceptions aren't as detailed. The list call should be roughly
198 # equivalent to the PySequence_Fast that ''.join() would do.
--> 199 chunks = self.iterencode(o, _one_shot=True)
200 if not isinstance(chunks, (list, tuple)):
201 chunks = list(chunks)
File ~/pytorch-test/env/lib/python3.9/json/encoder.py:257, in JSONEncoder.iterencode(self, o, _one_shot)
252 else:
253 _iterencode = _make_iterencode(
254 markers, self.default, _encoder, self.indent, floatstr,
255 self.key_separator, self.item_separator, self.sort_keys,
256 self.skipkeys, _one_shot)
--> 257 return _iterencode(o, 0)
File ~/pytorch-test/env/lib/python3.9/site-packages/phoenix/trace/span_json_encoder.py:48, in SpanJSONEncoder.default(self, obj)
46 elif isinstance(obj, SpanConversationAttributes):
47 return {"conversation_id": str(obj.conversation_id)}
---> 48 return super().default(obj)
File ~/pytorch-test/env/lib/python3.9/json/encoder.py:179, in JSONEncoder.default(self, o)
160 def default(self, o):
161 """Implement this method in a subclass such that it returns
162 a serializable object for ``o``, or calls the base implementation
163 (to raise a ``TypeError``).
(...)
177
178 """
--> 179 raise TypeError(f'Object of type {o.__class__.__name__} '
180 f'is not JSON serializable')
TypeError: Object of type ndarray is not JSON serializableI'm pretty sure it lays somewhere in the conversion from logs to dataframe to parquet file and back or in the way I'm doing it (I saw Roger Y. writing here an alternative approach: https://arize-ai.slack.com/archives/C04R3GXC8HK/p1709691603123049?thread_ts=1709688400.330719&cid=C04R3GXC8HK ) which didn't work in my instance either when I tried to save it. I hoped maybe you could help me apply the easiest solution / way to load the file and append new logs?
