@@ -2,14 +2,15 @@
 
 import os
 import json
-from dotenv import load_dotenv
+from dataclasses import dataclass
+from typing import List
 import argparse
 import asyncio
-from dataclasses import dataclass
+from tqdm.asyncio import tqdm as tqdm_async
+from dotenv import load_dotenv
+
 from models import OpenAIModel
-from typing import List
 from utils import create_event_loop, compute_content_hash
-from tqdm.asyncio import tqdm as tqdm_async
 
 INSTRUCTION_GENERATION_PROMPT = '''The background knowledge is:
 {doc}
@@ -49,6 +50,7 @@ def _post_process_answers(content: str) -> tuple:
         question = content.split('Question:')[1].split('Answer:')[0].strip()
         answer = content.split('Answer:')[1].strip()
         return question, answer
+    return None, None
 
 @dataclass
 class SelfQA:
@@ -59,7 +61,7 @@ def generate(self, docs: List[List[dict]]) -> List[dict]:
         loop = create_event_loop()
         return loop.run_until_complete(self.async_generate(docs))
 
-    async def async_generate(self, docs: List[List[dict]]) -> List[dict]:
+    async def async_generate(self, docs: List[List[dict]]) -> dict:
         final_results = {}
         semaphore = asyncio.Semaphore(self.max_concurrent)
 
@@ -71,20 +73,26 @@ async def process_chunk(content: str):
                 instruction_questions = _post_process_instructions(response)
 
                 qas = []
-                for qa in tqdm_async(asyncio.as_completed([self.llm_client.generate_answer(READING_COMPREHENSION_PROMPT.format(doc=content, question=question)) for question in instruction_questions]), total=len(instruction_questions), desc="Generating QAs"):
+                for qa in tqdm_async(asyncio.as_completed([
+                    self.llm_client.generate_answer(READING_COMPREHENSION_PROMPT.format(
+                        doc=content,
+                        question=question
+                    )) for question in instruction_questions]),
+                    total=len(instruction_questions), desc="Generating QAs"):
                     try:
                         question, answer = _post_process_answers(await qa)
-                        qas.append({
-                            compute_content_hash(question): {
-                                'question': question,
-                                'answer': answer
-                            }
-                        })
-                    except Exception as e:
+                        if question and answer:
+                            qas.append({
+                                compute_content_hash(question): {
+                                    'question': question,
+                                    'answer': answer
+                                }
+                            })
+                    except Exception as e:  # pylint: disable=broad-except
                         print(f"Error: {e}")
                         continue
                 return qas
-            except Exception as e:
+            except Exception as e:  # pylint: disable=broad-except
                 print(f"Error: {e}")
                 return []
 
@@ -98,7 +106,7 @@ async def process_chunk(content: str):
                 qas = await result
                 for qa in qas:
                     final_results.update(qa)
-            except Exception as e:
+            except Exception as e:  # pylint: disable=broad-except
                 print(f"Error: {e}")
         return final_results
 
@@ -131,15 +139,15 @@ async def process_chunk(content: str):
     self_qa = SelfQA(llm_client=llm_client)
 
     if args.data_type == 'raw':
-        with open(args.input_file, "r") as f:
+        with open(args.input_file, "r", encoding='utf-8') as f:
             data = [json.loads(line) for line in f]
         data = [[chunk] for chunk in data]
     elif args.data_type == 'chunked':
-        with open(args.input_file, "r") as f:
+        with open(args.input_file, "r", encoding='utf-8') as f:
             data = json.load(f)
 
     results = self_qa.generate(data)
 
     # Save results
-    with open(args.output_file, "w") as f:
+    with open(args.output_file, "w", encoding='utf-8') as f:
         json.dump(results, f, indent=4, ensure_ascii=False)
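
The behavioral core of this patch is a parse-then-guard pattern: `_post_process_answers` now falls back to `(None, None)` on malformed model output instead of raising `IndexError` from the `split` calls, and the caller appends a QA pair only when both fields are present. A minimal standalone sketch of how the patched helper behaves; the marker-presence check is an assumption, since the guard condition itself sits just above the lines shown in the hunk:

    def _post_process_answers(content: str) -> tuple:
        # Assumed guard: parse only when both markers are present; otherwise
        # report failure as (None, None) rather than raising on the split.
        if 'Question:' in content and 'Answer:' in content:
            question = content.split('Question:')[1].split('Answer:')[0].strip()
            answer = content.split('Answer:')[1].strip()
            return question, answer
        return None, None

    print(_post_process_answers('Question: Why?\nAnswer: Because.'))  # ('Why?', 'Because.')
    print(_post_process_answers('no markers at all'))                 # (None, None)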
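
The reflowed loop also shows the fan-out pattern `async_generate` relies on: schedule one coroutine per question, bound concurrency with `asyncio.Semaphore`, and drain completions through `tqdm_async(asyncio.as_completed(...))`. A self-contained sketch under stated assumptions; `fake_llm` is a hypothetical stand-in for `self.llm_client.generate_answer`, and the limit of 4 stands in for `self.max_concurrent`:

    import asyncio
    from tqdm.asyncio import tqdm as tqdm_async

    async def main():
        semaphore = asyncio.Semaphore(4)  # stand-in for self.max_concurrent

        async def fake_llm(prompt: str) -> str:
            # Hypothetical stand-in for self.llm_client.generate_answer
            async with semaphore:          # at most 4 "requests" in flight
                await asyncio.sleep(0.1)   # simulated network latency
                return f"Question: {prompt}\nAnswer: stub"

        prompts = [f"q{i}" for i in range(10)]
        answers = []
        # as_completed yields awaitables as each task finishes, so the
        # progress bar advances in completion order, not submission order.
        for task in tqdm_async(asyncio.as_completed([fake_llm(p) for p in prompts]),
                               total=len(prompts), desc="Generating QAs"):
            answers.append(await task)
        print(len(answers))  # 10

    asyncio.run(main())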