Troubleshooting DataFrame Saving and Loading in Jupyter Notebooks
Hi guys, first of all, thanks a lot for all the work you are doing. It's great to see how effortlessly it captures all the requests and stores them. I have a small question, though, and probably I'm doing it incorrectly. As I'm running some of my prototypes in Jupyter Notebooks, I tend to save stuff in dataframes. For my run, I tried to do it in the following way after running all prompts: `save_traces = px.Client().get_spans_dataframe()` followed by `save_traces.to_parquet('traces_step_4.parquet')`. When initiating a new run, I try to reload it this way: `save_traces = pd.read_parquet('traces_step_4.parquet')` and then `session = px.launch_app(trace=px.TraceDataset(save_traces))`. That seems to load the data I used before. However, when trying to save it again, it gives an error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In[50], line 1
----> 1 save_traces = px.Client().get_spans_dataframe()
3 save_traces.to_parquet('traces_step_4.parquet')
File ~/pytorch-test/env/lib/python3.9/site-packages/phoenix/session/data_extractor.py:39, in TraceDataExtractor.get_spans_dataframe(self, filter_condition, start_time, stop_time, root_spans_only, project_name)
28 def get_spans_dataframe(
29 self,
30 filter_condition: Optional[str] = None,
(...)
35 project_name: Optional[str] = None,
36 ) -> Optional[pd.DataFrame]:
37 return cast(
38 Optional[pd.DataFrame],
---> 39 self.query_spans(
40 SpanQuery().where(filter_condition or ""),
41 start_time=start_time,
42 stop_time=stop_time,
43 root_spans_only=root_spans_only,
44 project_name=project_name,
45 ),
46 )
File ~/pytorch-test/env/lib/python3.9/site-packages/phoenix/session/client.py:71, in Client.query_spans(self, start_time, stop_time, root_spans_only, project_name, *queries)
69 queries = (SpanQuery(),)
70 if self._use_active_session_if_available and (session := px.active_session()):
---> 71 return session.query_spans(
72 *queries,
73 start_time=start_time,
74 stop_time=stop_time,
75 root_spans_only=root_spans_only,
76 project_name=project_name,
77 )
78 response = self._session.get(
79 url=urljoin(self._base_url, "/v1/spans"),
80 json={
(...)
86 },
87 )
88 if response.status_code == 404:
File ~/pytorch-test/env/lib/python3.9/site-packages/phoenix/session/session.py:379, in ThreadSession.query_spans(self, start_time, stop_time, root_spans_only, project_name, *queries)
370 valid_eval_names = project.get_span_evaluation_names() if project else ()
371 queries = tuple(
372 SpanQuery.from_dict(
373 query.to_dict(),
(...)
377 for query in queries
378 )
--> 379 results = query_spans(
380 project,
381 *queries,
382 start_time=start_time,
383 stop_time=stop_time,
384 root_spans_only=root_spans_only,
385 )
386 if len(results) == 1:
387 df = results[0]
File ~/pytorch-test/env/lib/python3.9/site-packages/phoenix/utilities/__init__.py:26, in query_spans(project, start_time, stop_time, root_spans_only, *queries)
18 return []
19 spans = tuple(
20 project.get_spans(
21 start_time=start_time,
(...)
24 )
25 )
---> 26 return [query(spans) for query in queries]
File ~/pytorch-test/env/lib/python3.9/site-packages/phoenix/utilities/__init__.py:26, in <listcomp>(.0)
18 return []
19 spans = tuple(
20 project.get_spans(
21 start_time=start_time,
(...)
24 )
25 )
---> 26 return [query(spans) for query in queries]
File ~/pytorch-test/env/lib/python3.9/site-packages/phoenix/trace/dsl/query.py:294, in SpanQuery.__call__(self, spans)
289 spans = filter(
290 lambda span: (isinstance(seq := self._concat.value(span), Sequence) and len(seq)),
291 spans,
292 )
293 if not (self._select or self._explode or self._concat):
--> 294 if not (data := [json.loads(span_to_json(span)) for span in spans]):
295 return pd.DataFrame()
296 return (
297 pd.json_normalize(data, max_level=1)
298 .rename(self._rename, axis=1, errors="ignore")
299 .set_index("context.span_id", drop=False)
300 )
File ~/pytorch-test/env/lib/python3.9/site-packages/phoenix/trace/dsl/query.py:294, in <listcomp>(.0)
289 spans = filter(
290 lambda span: (isinstance(seq := self._concat.value(span), Sequence) and len(seq)),
291 spans,
292 )
293 if not (self._select or self._explode or self._concat):
--> 294 if not (data := [json.loads(span_to_json(span)) for span in spans]):
295 return pd.DataFrame()
296 return (
297 pd.json_normalize(data, max_level=1)
298 .rename(self._rename, axis=1, errors="ignore")
299 .set_index("context.span_id", drop=False)
300 )
File ~/pytorch-test/env/lib/python3.9/site-packages/phoenix/trace/span_json_encoder.py:52, in span_to_json(span)
51 def span_to_json(span: Span) -> str:
---> 52 return json.dumps(span, cls=SpanJSONEncoder)
File ~/pytorch-test/env/lib/python3.9/json/__init__.py:234, in dumps(obj, skipkeys, ensure_ascii, check_circular, allow_nan, cls, indent, separators, default, sort_keys, **kw)
232 if cls is None:
233 cls = JSONEncoder
--> 234 return cls(
235 skipkeys=skipkeys, ensure_ascii=ensure_ascii,
236 check_circular=check_circular, allow_nan=allow_nan, indent=indent,
237 separators=separators, default=default, sort_keys=sort_keys,
238 **kw).encode(obj)
File ~/pytorch-test/env/lib/python3.9/json/encoder.py:199, in JSONEncoder.encode(self, o)
195 return encode_basestring(o)
196 # This doesn't pass the iterator directly to ''.join() because the
197 # exceptions aren't as detailed. The list call should be roughly
198 # equivalent to the PySequence_Fast that ''.join() would do.
--> 199 chunks = self.iterencode(o, _one_shot=True)
200 if not isinstance(chunks, (list, tuple)):
201 chunks = list(chunks)
File ~/pytorch-test/env/lib/python3.9/json/encoder.py:257, in JSONEncoder.iterencode(self, o, _one_shot)
252 else:
253 _iterencode = _make_iterencode(
254 markers, self.default, _encoder, self.indent, floatstr,
255 self.key_separator, self.item_separator, self.sort_keys,
256 self.skipkeys, _one_shot)
--> 257 return _iterencode(o, 0)
File ~/pytorch-test/env/lib/python3.9/site-packages/phoenix/trace/span_json_encoder.py:48, in SpanJSONEncoder.default(self, obj)
46 elif isinstance(obj, SpanConversationAttributes):
47 return {"conversation_id": str(obj.conversation_id)}
---> 48 return super().default(obj)
File ~/pytorch-test/env/lib/python3.9/json/encoder.py:179, in JSONEncoder.default(self, o)
160 def default(self, o):
161 """Implement this method in a subclass such that it returns
162 a serializable object for ``o``, or calls the base implementation
163 (to raise a ``TypeError``).
(...)
177
178 """
--> 179 raise TypeError(f'Object of type {o.__class__.__name__} '
180 f'is not JSON serializable')
TypeError: Object of type ndarray is not JSON serializableI'm pretty sure it lays somewhere in the conversion from logs to dataframe to parquet file and back or in the way I'm doing it (I saw Roger Y. writing here an alternative approach: https://arize-ai.slack.com/archives/C04R3GXC8HK/p1709691603123049?thread_ts=1709688400.330719&cid=C04R3GXC8HK ) which didn't work in my instance either when I tried to save it. I hoped maybe you could help me apply the easiest solution / way to load the file and append new logs?
