feat(charts): add graph traversal interface

ChenZiHong-Gavin · ChenZiHong-Gavin · commit 2623b1501f87 · 2025-01-10T16:31:40.000+08:00
diff --git a/.pylintrc b/.pylintrc
@@ -52,18 +52,16 @@ ignore=CVS
 # ignore-list. The regex matches against paths and can be in Posix or Windows
 # format. Because '\\' represents the directory delimiter on Windows systems,
 # it can't be used as an escape character.
-ignore-paths=
 
 # Files or directories matching the regular expression patterns are skipped.
 # The regex matches against base names, not paths. The default value ignores
 # Emacs file locks
 ignore-patterns=^\.#
 
-# List of module names for which member attributes should not be checked and
-# will not be imported (useful for modules/projects where namespaces are
-# manipulated during runtime and thus existing member attributes cannot be
-# deduced by static analysis). It supports qualified module names, as well as
-# Unix pattern matching.
+# List of module names for which member attributes should not be checked
+# (useful for modules/projects where namespaces are manipulated during runtime
+# and thus existing member attributes cannot be deduced by static analysis). It
+# supports qualified module names, as well as Unix pattern matching.
 ignored-modules=
 
 # Python code to execute, usually for sys.path manipulation such as
@@ -87,13 +85,9 @@ load-plugins=
 # Pickle collected data for later comparisons.
 persistent=yes
 
-# Resolve imports to .pyi stubs if available. May reduce no-member messages and
-# increase not-an-iterable messages.
-prefer-stubs=no
-
 # Minimum Python version to use for version dependent checks. Will default to
 # the version used to run pylint.
-py-version=3.10
+py-version=3.11
 
 # Discover python modules and packages in the file system subtree.
 recursive=no
@@ -307,9 +301,6 @@ max-locals=15
 # Maximum number of parents for a class (see R0901).
 max-parents=7
 
-# Maximum number of positional arguments for function / method.
-max-positional-arguments=5
-
 # Maximum number of public methods for a class (see R0904).
 max-public-methods=20
 
@@ -345,7 +336,7 @@ indent-after-paren=4
 indent-string='    '
 
 # Maximum number of characters on a single line.
-max-line-length=100
+max-line-length=120
 
 # Maximum number of lines in a module.
 max-module-lines=1000
@@ -438,7 +429,28 @@ disable=raw-checker-failed,
         deprecated-pragma,
         use-symbolic-message-instead,
         use-implicit-booleaness-not-comparison-to-string,
-        use-implicit-booleaness-not-comparison-to-zero
+        use-implicit-booleaness-not-comparison-to-zero,
+        missing-module-docstring,
+        missing-class-docstring,
+        missing-function-docstring,
+        W0122,  # Use of exec (exec-used)
+        R0914,  # Too many local variables (19/15) (too-many-locals)
+        R0903,  # Too few public methods (1/2)
+        W0613,  # Unused argument
+        W0511,  # TODO
+        W0719,  # Raising too general exception: Exception
+        R0801,  # Similar lines
+        W0105,  # String statement has no effect (pointless-string-statement)
+        R0913,  # Too many arguments (6/5) (too-many-arguments)
+        C0415,  # Import outside toplevel
+        R0902,  # Too many instance attributes (11/7)
+        R1725,  # Consider using Python 3 style super() without arguments (super-with-arguments)
+        W0622,  # Redefining built-in 'id' (redefined-builtin)
+        R0904,  # Too many public methods (27/20) (too-many-public-methods)
+        E1120,  # TODO: unbound-method-call-no-value-for-parameter
+        R0917,  # Too many positional arguments (6/5) (too-many-positional-arguments)
+        C0103,  # Invalid name (invalid-name)
+        E0401   # Unable to import (import-error)
 
 # Enable the message, report, category or checker with the given id(s). You can
 # either give multiple identifier separated by comma (,) or put this option
@@ -476,11 +488,6 @@ max-nested-blocks=5
 # printed.
 never-returning-functions=sys.exit,argparse.parse_error
 
-# Let 'consider-using-join' be raised when the separator to join on would be
-# non-empty (resulting in expected fixes of the type: ``"- " + " -
-# ".join(items)``)
-suggest-join-with-non-empty-separator=yes
-
 
 [REPORTS]
 
diff --git a/charts/plot_loss_change.py b/charts/plot_loss_change.py
@@ -0,0 +1 @@
+# Loss change before and after training
diff --git a/charts/plot_rephrase_process.py b/charts/plot_rephrase_process.py
@@ -5,6 +5,8 @@
 from models import Tokenizer
 from utils.log import parse_log
 import plotly.express as px
+import plotly.graph_objects as go
+from collections import defaultdict
 
 def analyse_log(log_info: dict) -> list:
     """
@@ -71,7 +73,66 @@ async def plot_rephrase_process(stats: list[dict]):
     fig = px.scatter(df, x="pre_length", y="post_length", size="count", color="count", hover_name="count")
     fig.show()
 
+def plot_pre_length_distribution(stats: list[dict]):
+    """
+    Plot the distribution of pre-length as a bar chart.
+
+    Lengths are grouped into buckets of 50 ([0, 50), [50, 100), ...); only
+    buckets that contain at least one item are drawn.
+
+    :param stats: list of dicts, each with a numeric 'pre_length' entry
+    :return fig: plotly Figure (an empty one when stats is empty)
+    """
+
+    if not stats:
+        return go.Figure()
+
+    # Width of each bucket; a label reads "<start>-<start + bin_size>".
+    bin_size = 50
+
+    # Count items per bucket, keyed by the numeric bucket start so the ordering
+    # below never has to parse a label (which breaks on negative starts).
+    length_distribution = defaultdict(int)
+    for item in stats:
+        bin_start = (item['pre_length'] // bin_size) * bin_size
+        length_distribution[bin_start] += 1
+
+    # Ascending bucket order; labels and counts are built once and reused.
+    bin_starts = sorted(length_distribution)
+    sorted_bins = [f"{start}-{start + bin_size}" for start in bin_starts]
+    counts = [length_distribution[start] for start in bin_starts]
+
+    # One bar per bucket, with its count printed on the bar.
+    fig = go.Figure(data=[
+        go.Bar(
+            x=sorted_bins,
+            y=counts,
+            text=counts,
+            textposition='auto',
+        )
+    ])
+
+    # Titles and bar spacing; the legend is turned off.
+    fig.update_layout(
+        title='Distribution of Pre-Length',
+        xaxis_title='Length Range',
+        yaxis_title='Count',
+        bargap=0.2,
+        showlegend=False
+    )
+
+    # With more than 10 buckets, tilt the labels and show every other one.
+    if len(sorted_bins) > 10:
+        fig.update_layout(
+            xaxis={
+                'tickangle': 45,
+                'tickmode': 'array',
+                'ticktext': sorted_bins[::2],
+                'tickvals': list(range(len(sorted_bins)))[::2]
+            }
+        )
 
+    return fig
 
 if __name__ == "__main__":
     log = parse_log('/home/PJLAB/chenzihong/Project/graphgen/cache/logs/graphgen.log')
diff --git a/evaluate.py b/evaluate.py
@@ -1,3 +1,5 @@
+"""Evaluate the quality of the generated text using various metrics"""
+
 import os
 import json
 import argparse
@@ -72,7 +74,8 @@ def clean_gpu_cache():
     parser.add_argument('--output', type=str, default='cache/output', help='path to save output')
 
     parser.add_argument('--tokenizer', type=str, default='cl100k_base', help='tokenizer name')
-    parser.add_argument('--reward', type=str, default='OpenAssistant/reward-model-deberta-v3-large-v2', help='Comma-separated list of reward models')
+    parser.add_argument('--reward', type=str, default='OpenAssistant/reward-model-deberta-v3-large-v2',
+                        help='Comma-separated list of reward models')
     parser.add_argument('--uni', type=str, default='MingZhong/unieval-sum', help='uni model name')
 
     args = parser.parse_args()
@@ -122,5 +125,4 @@ def clean_gpu_cache():
 
 
     results = pd.DataFrame(results)
-        
     results.to_csv(os.path.join(args.output, 'evaluation.csv'), index=False)
diff --git a/generate.py b/generate.py
@@ -1,9 +1,10 @@
 import os
 import json
 import argparse
+from dotenv import load_dotenv
+
 from graphgen.graphgen import GraphGen
 from models import OpenAIModel, Tokenizer, TraverseStrategy
-from dotenv import load_dotenv
 from utils import set_logger
 
 sys_path = os.path.abspath(os.path.dirname(__file__))
@@ -35,11 +36,13 @@
     input_file = args.input_file
 
     if args.data_type == 'raw':
-        with open(input_file, "r") as f:
+        with open(input_file, "r", encoding='utf-8') as f:
             data = [json.loads(line) for line in f]
     elif args.data_type == 'chunked':
-        with open(input_file, "r") as f:
+        with open(input_file, "r", encoding='utf-8') as f:
             data = json.load(f)
+    else:
+        raise ValueError(f"Invalid data type: {args.data_type}")
 
     teacher_llm_client = OpenAIModel(
         model_name=os.getenv("TEACHER_MODEL"),
diff --git a/simulate.py b/simulate.py
@@ -0,0 +1,125 @@
+"""Simulate text length distributions using input data distributions when rephrasing."""
+
+import gradio as gr
+
+from models import TraverseStrategy, NetworkXStorage
+from charts.plot_rephrase_process import plot_pre_length_distribution
+from graphgen.operators.split_graph import get_batches_with_strategy
+from utils import create_event_loop
+import copy
+
+if __name__ == "__main__":
+    networkx_storage = NetworkXStorage(  # NOTE(review): path below is a hard-coded developer-local cache dir
+        '/home/PJLAB/chenzihong/Project/graphgen/cache', namespace="graph"
+    )
+
+    async def get_batches(traverse_strategy: TraverseStrategy):
+        """
+        Split an independent copy of the stored graph into batches.
+
+        :param traverse_strategy: traversal settings forwarded to get_batches_with_strategy
+        :return: the result of get_batches_with_strategy for the copied nodes and edges
+        """
+        # A single deepcopy yields fully independent data; no separate per-dict copy is needed.
+        # NOTE(review): assumes nodes are 2-tuples and edges 3-tuples ending in a dict — confirm.
+        nodes = copy.deepcopy(list(await networkx_storage.get_all_nodes()))
+        edges = copy.deepcopy(list(await networkx_storage.get_all_edges()))
+
+        # traverse_graph sums the 'length' values, so fail early if any are missing.
+        assert all('length' in edge[2] for edge in edges)
+        assert all('length' in node[1] for node in nodes)
+
+        return await get_batches_with_strategy(nodes, edges, networkx_storage, traverse_strategy)
+
+    def traverse_graph(
+        bidirectional: bool,
+        expand_method: str,
+        max_extra_edges: int,
+        max_tokens: int,
+        max_depth: int,
+        edge_sampling: str,
+        isolated_node_strategy: str
+    ):
+        """Build a TraverseStrategy from the UI values and return the figure of batch pre-lengths."""
+        traverse_strategy = TraverseStrategy(
+            bidirectional=bidirectional,
+            expand_method=expand_method,
+            max_extra_edges=max_extra_edges,
+            max_tokens=max_tokens,
+            max_depth=max_depth,
+            edge_sampling=edge_sampling,
+            isolated_node_strategy=isolated_node_strategy
+        )
+
+        loop = create_event_loop()
+        try:
+            batches = loop.run_until_complete(get_batches(traverse_strategy))
+        finally:
+            # Close the loop even when the traversal raises, so a failed click does not leak it.
+            loop.close()
+
+        # A batch's pre-length is the summed 'length' of its nodes (batch[0]) and edges (batch[1]).
+        data = [
+            {'pre_length': sum(n['length'] for n in batch[0]) + sum(e[2]['length'] for e in batch[1])}
+            for batch in batches
+        ]
+        return plot_pre_length_distribution(data)
+
+
+    def update_sliders(expand_method):
+        """Return visibility updates for (max_tokens, max_extra_edges), showing only the relevant one."""
+        # Exactly one slider is visible: max_tokens for "max_tokens", otherwise max_extra_edges.
+        use_tokens = expand_method == "max_tokens"
+        return gr.update(visible=use_tokens), gr.update(visible=not use_tokens)
+
+
+    with gr.Blocks() as iface:
+        gr.Markdown("# Graph Traversal Interface")
+
+        with gr.Row():
+            with gr.Column():
+                bidirectional = gr.Checkbox(label="Bidirectional", value=False)
+                expand_method = gr.Dropdown(
+                    choices=["max_width", "max_tokens"],
+                    value="max_tokens",
+                    label="Expand Method",
+                    interactive=True
+                )
+
+                # max_extra_edges starts hidden because "max_tokens" is the default expand method
+                max_extra_edges = gr.Slider(minimum=1, maximum=50, value=5, step=1, label="Max Extra Edges",
+                                            visible=False)
+                max_tokens = gr.Slider(minimum=128, maximum=8 * 1024, value=1024, step=128, label="Max Tokens")
+                max_depth = gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Max Depth")
+                edge_sampling = gr.Dropdown(
+                    choices=["max_loss", "random", "min_loss"],
+                    value="max_loss",
+                    label="Edge Sampling Strategy"
+                )
+                isolated_node_strategy = gr.Dropdown(
+                    choices=["add", "ignore", "connect"],
+                    value="add",
+                    label="Isolated Node Strategy"
+                )
+                submit_btn = gr.Button("Traverse Graph")
+
+        with gr.Row():
+            output_plot = gr.Plot(label="Graph Visualization")
+
+        # Swap the visible slider when the expand method changes (outputs match update_sliders' return order)
+        expand_method.change(fn=update_sliders, inputs=expand_method, outputs=[max_tokens, max_extra_edges])
+
+        submit_btn.click(
+            fn=traverse_graph,
+            inputs=[  # listed in the same order as traverse_graph's parameters
+                bidirectional,
+                expand_method,
+                max_extra_edges,
+                max_tokens,
+                max_depth,
+                edge_sampling,
+                isolated_node_strategy
+            ],
+            outputs=[output_plot]
+        )
+    iface.launch()