Skip to content

Commit 387d961

Browse files
committed
feat: Add conversation depth analyzer for message tree metrics
This feature adds the ability to analyze conversation tree depth metrics: a new DepthAnalysis schema for storing analysis results; a calculate_tree_depth function for traversing and measuring tree depths; analyze_conversation_depth for single-tree analysis; and get_depth_summary for batch analysis across multiple trees. This helps understand conversation patterns and tree structure in the dataset.
1 parent f1e6ed9 commit 387d961

File tree

4 files changed

+128
-1
lines changed

4 files changed

+128
-1
lines changed

oasst-data/oasst_data/__init__.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from oasst_data.analytics import analyze_conversation_depth, get_depth_summary
12
from oasst_data.reader import (
23
read_dataset_message_trees,
34
read_dataset_messages,
@@ -7,6 +8,7 @@
78
read_messages,
89
)
910
from oasst_data.schemas import (
11+
DepthAnalysis,
1012
ExportMessageEvent,
1113
ExportMessageEventEmoji,
1214
ExportMessageEventRanking,
@@ -18,12 +20,13 @@
1820
LabelAvgValue,
1921
LabelValues,
2022
)
21-
from oasst_data.traversal import visit_messages_depth_first, visit_threads_depth_first
23+
from oasst_data.traversal import calculate_tree_depth, visit_messages_depth_first, visit_threads_depth_first
2224
from oasst_data.writer import write_message_trees, write_messages
2325

2426
__all__ = [
2527
"LabelAvgValue",
2628
"LabelValues",
29+
"DepthAnalysis",
2730
"ExportMessageEvent",
2831
"ExportMessageEventEmoji",
2932
"ExportMessageEventRating",
@@ -38,6 +41,9 @@
3841
"read_message_list",
3942
"visit_threads_depth_first",
4043
"visit_messages_depth_first",
44+
"calculate_tree_depth",
45+
"analyze_conversation_depth",
46+
"get_depth_summary",
4147
"write_message_trees",
4248
"write_messages",
4349
"read_dataset_message_trees",

oasst-data/oasst_data/analytics.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
"""
2+
Conversation analytics module for analyzing message tree metrics.
3+
4+
This module provides functions to analyze conversation depth and quality metrics
5+
from Open Assistant message trees.
6+
"""
7+
8+
from typing import Optional
9+
10+
from .schemas import DepthAnalysis, ExportMessageNode, ExportMessageTree
11+
from .traversal import calculate_tree_depth
12+
13+
14+
def analyze_conversation_depth(tree: ExportMessageTree) -> Optional[DepthAnalysis]:
    """
    Analyze the depth metrics of a conversation tree.

    Args:
        tree: The conversation tree to analyze

    Returns:
        DepthAnalysis object containing depth metrics, or None if tree is empty
        (i.e. it has no prompt node)
    """
    if not tree.prompt:
        return None

    # Get depth metrics from traversal
    depth_result = calculate_tree_depth(tree.prompt)
    leaf_depths = depth_result["leaf_depths"]

    # The average is taken over leaf nodes, so the divisor is the number of
    # leaves, not the total number of messages in the tree.
    average_depth = sum(leaf_depths) / len(leaf_depths) if leaf_depths else 0.0

    return DepthAnalysis(
        # calculate_tree_depth publishes this value under "maximum_depth";
        # int() normalizes it in case it arrives as a float.
        max_depth=int(depth_result["maximum_depth"]),
        average_depth=average_depth,
        total_messages=depth_result["total_messages"],
    )
45+
46+
47+
def get_depth_summary(trees: list[ExportMessageTree]) -> dict:
48+
"""
49+
Get a summary of depth metrics across multiple conversation trees.
50+
51+
Args:
52+
trees: List of conversation trees to analyze
53+
54+
Returns:
55+
Dictionary with aggregated depth statistics
56+
"""
57+
analyses = []
58+
for tree in trees:
59+
analysis = analyze_conversation_depth(tree)
60+
if analysis:
61+
analyses.append(analysis)
62+
63+
if not analyses:
64+
return {
65+
"total_trees": 0,
66+
"avg_max_depth": 0.0,
67+
"avg_average_depth": 0.0,
68+
"total_messages": 0,
69+
}
70+
71+
return {
72+
"total_trees": len(analyses),
73+
"avg_max_depth": sum(a.max_depth for a in analyses) / len(analyses),
74+
"avg_average_depth": sum(a.average_depth for a in analyses) / len(analyses),
75+
"total_messages": sum(a.total_messages for a in analyses),
76+
}
77+

oasst-data/oasst_data/schemas.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,3 +93,11 @@ class ExportMessageTree(BaseModel):
9393
tree_state: Optional[str]
9494
prompt: Optional[ExportMessageNode]
9595
origin: Optional[str]
96+
97+
98+
class DepthAnalysis(BaseModel):
    """
    Depth metrics computed for a single conversation tree.

    Fields:
        max_depth: Maximum depth of the conversation tree
        average_depth: Average depth across all leaf nodes
        total_messages: Total number of messages in the tree
    """

    max_depth: int
    average_depth: float
    total_messages: int

oasst-data/oasst_data/traversal.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,39 @@ def visit_messages_depth_first(
3333
if node.replies:
3434
for c in node.replies:
3535
visit_messages_depth_first(node=c, visitor=visitor, predicate=predicate)
36+
37+
38+
def calculate_tree_depth(node: ExportMessageNode, current_depth: int = 0) -> dict:
    """
    Calculate depth metrics for a conversation tree.

    Depth is counted from ``current_depth`` at ``node`` and grows by one for
    each level of replies, so with the default the root sits at depth 0.

    Args:
        node: The root node of the conversation tree
        current_depth: Depth assigned to ``node`` (used for recursion)

    Returns:
        Dictionary containing:
            maximum_depth: depth of the deepest leaf, as an int
            leaf_depths: depth of every leaf, in depth-first order
            total_messages: number of nodes in the subtree, including ``node``
    """
    if not node:
        return {"maximum_depth": 0, "leaf_depths": [], "total_messages": 0}

    total = 1
    leaf_depths = []

    if not node.replies:
        # No replies (None or empty): this is a leaf node
        leaf_depths.append(current_depth)
    else:
        # Recurse into children, accumulating their leaves and message counts
        for reply in node.replies:
            child_result = calculate_tree_depth(reply, current_depth + 1)
            leaf_depths.extend(child_result["leaf_depths"])
            total += child_result["total_messages"]

    # leaf_depths is only empty here if every reply was falsy; fall back to
    # this node's own depth in that case.
    max_depth = max(leaf_depths, default=current_depth)

    return {
        # Depth is a whole number of levels, so report it as an int
        "maximum_depth": int(max_depth),
        "leaf_depths": leaf_depths,
        "total_messages": total,
    }

0 commit comments

Comments
 (0)