diff --git a/opencompass/configs/datasets/livecodebench/livecodebench_pro_gen.py b/opencompass/configs/datasets/livecodebench_pro/livecodebench_pro_gen.py
similarity index 76%
rename from opencompass/configs/datasets/livecodebench/livecodebench_pro_gen.py
rename to opencompass/configs/datasets/livecodebench_pro/livecodebench_pro_gen.py
index 4b4052abd..a8c192809 100644
--- a/opencompass/configs/datasets/livecodebench/livecodebench_pro_gen.py
+++ b/opencompass/configs/datasets/livecodebench_pro/livecodebench_pro_gen.py
@@ -3,6 +3,7 @@
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.datasets import (
     LCBProDataset,
+    LCBProEvaluator,
 )
 
 lcb_pro_reader_cfg = dict(
@@ -29,7 +30,14 @@
 )
 
 lcb_pro_eval_cfg = dict(
-    evaluator=dict()
+    evaluator=dict(
+        type=LCBProEvaluator,
+        submit_url='http://lightcpverifier.ailab.ailab.ai/submit',
+        result_url='http://lightcpverifier.ailab.ailab.ai/result/{submission_id}',
+        timeout=10,
+        poll_interval=10,
+        max_retries=3,
+    )
 )
 
 lcb_pro_datasets = [
diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py
index b51a9172e..0ac0b4a7b 100644
--- a/opencompass/datasets/__init__.py
+++ b/opencompass/datasets/__init__.py
@@ -93,6 +93,7 @@
 from .lcsts import *  # noqa: F401, F403
 from .leval import *  # noqa: F401, F403
 from .livecodebench import *  # noqa: F401, F403
+from .livecodebench_pro import *  # noqa: F401, F403
 from .livemathbench import *  # noqa: F401, F403
 from .livereasonbench import *  # noqa: F401, F403
 from .livestembench import *  # noqa: F401, F403
diff --git a/opencompass/datasets/livecodebench_pro/__init__.py b/opencompass/datasets/livecodebench_pro/__init__.py
new file mode 100644
index 000000000..06e9c866a
--- /dev/null
+++ b/opencompass/datasets/livecodebench_pro/__init__.py
@@ -0,0 +1,4 @@
+from .livecodebench_pro import LCBProDataset  # noqa: F401
+from .livecodebench_pro_evaluator import LCBProEvaluator  # noqa: F401
+
+__all__ = ['LCBProDataset', 'LCBProEvaluator']
diff --git a/opencompass/datasets/livecodebench_pro/livecodebench_pro.py b/opencompass/datasets/livecodebench_pro/livecodebench_pro.py
new file mode 100644
index 000000000..bc519dc4d
--- /dev/null
+++ b/opencompass/datasets/livecodebench_pro/livecodebench_pro.py
@@ -0,0 +1,26 @@
+import json
+
+from datasets import Dataset
+
+from opencompass.utils import get_data_path
+
+from ..base import BaseDataset
+
+
+class LCBProDataset(BaseDataset):
+
+    @staticmethod
+    def load(path, **kwargs):
+        path = get_data_path(path)
+        dataset_list = []
+        with open(path, 'r', encoding='utf-8') as f:
+            for line in f:
+                line = line.strip()
+                if not line:  # skip blank lines in the JSONL file
+                    continue
+                data = json.loads(line)
+                dataset_list.append({
+                    'id_ddm': data['id_ddm'],
+                    'problem': data['dialogs'][0]['content']
+                })
+        return Dataset.from_list(dataset_list)
diff --git a/opencompass/datasets/livecodebench_pro/livecodebench_pro_evaluator.py b/opencompass/datasets/livecodebench_pro/livecodebench_pro_evaluator.py
new file mode 100644
index 000000000..60b22a8c5
--- /dev/null
+++ b/opencompass/datasets/livecodebench_pro/livecodebench_pro_evaluator.py
@@ -0,0 +1,478 @@
+# flake8: noqa: E501
+
+import re
+import time
+from typing import Dict, List
+
+import requests
+from datasets import Dataset
+
+from opencompass.openicl.icl_evaluator import BaseEvaluator
+from opencompass.registry import ICL_EVALUATORS
+
+
+def extract_longest_cpp_code(text):
+    """Extract C++ code from text.
+
+    First tries to find fenced code blocks with ```cpp.
+    If none is found, falls back to code containing a main() function and an
+    #include. Note that for fenced blocks this returns the last block
+    containing '#include', not necessarily the longest one.
+
+    Args:
+        text (str): Text containing C++ code
+
+    Returns:
+        str or None: Extracted C++ code or None if not found
+    """
+    # -------------------------------
+    # 1. Match every fenced code block that opens with ```cpp at the start of a line
+    # -------------------------------
+    fenced_pattern = r'(?m)^```cpp\s*\n(.*?)\n```'
+    fenced_blocks = re.findall(fenced_pattern, text, flags=re.DOTALL)
+    if fenced_blocks:
+        # Walk backwards from the last block; return the first one containing "#include"
+        for block in reversed(fenced_blocks):
+            if '#include' in block:
+                return block.strip()
+    # -------------------------------
+    # 2. No suitable fenced block: fall back to extracting code around main()
+    #    Walk backwards from the last occurrence of main and return the first candidate that qualifies
+    # -------------------------------
+    cleaned_text = text  # Alias; the original text is not modified
+    main_matches = list(re.finditer(r'int\s+main\s*\(', cleaned_text))
+    if main_matches:
+        # Walk backwards from the last main
+        for main in reversed(main_matches):
+            main_start_pos = main.start()
+            main_end_pos = main.end()
+
+            # From main_end_pos, find the opening brace '{' of main's body
+            brace_start = cleaned_text.find('{', main_end_pos)
+            if brace_start == -1:
+                # No opening brace found; skip this main
+                continue
+            # Match braces: scan forward until the nesting depth returns to zero
+            brace_count = 0
+            idx = brace_start
+            text_len = len(cleaned_text)
+            while idx < text_len:
+                ch = cleaned_text[idx]
+                if ch == '{':
+                    brace_count += 1
+                elif ch == '}':
+                    brace_count -= 1
+                    if brace_count == 0:
+                        idx += 1  # Include the closing brace
+                        break
+                idx += 1
+            func_end = idx  # End position of the main function block
+            # Split the text into lines and record each line's start offset
+            # (assumes 1-character '\n' newlines)
+            lines = cleaned_text.splitlines()
+            line_start_indices = []
+            curr_idx = 0
+            for line in lines:
+                line_start_indices.append(curr_idx)
+                curr_idx += len(line) + 1
+
+            # Locate the line where main appears
+            main_line_index = None
+            for i, start in enumerate(line_start_indices):
+                if start <= main_start_pos < (start + len(lines[i]) + 1):
+                    main_line_index = i
+                    break
+            if main_line_index is None:
+                main_line_index = 0
+
+            # Scan upwards from the main line for the run of consecutive '#include' lines above it
+            include_line_index = None
+            for i in range(main_line_index, -1, -1):
+                if re.match(r'^\s*#include', lines[i]):
+                    include_line_index = i
+                else:
+                    # Stop at the first non-#include line once at least one #include has been seen
+                    if include_line_index is not None:
+                        break
+
+            candidate_start = (line_start_indices[include_line_index]
+                               if include_line_index is not None else
+                               line_start_indices[main_line_index])
+
+            candidate_code = cleaned_text[candidate_start:func_end].strip()
+            if '#include' in candidate_code:
+                return candidate_code
+
+    return None
+
+
+def extract_longest_python_code(text):
+    """Extract Python code from text.
+
+    First tries to find fenced code blocks with ```python.
+    If none are found, falls back to function/class definitions, then to
+    import statements.
+
+    Args:
+        text (str): Text containing Python code
+
+    Returns:
+        str or None: Extracted Python code or None if not found
+    """
+    # -------------------------------
+    # 1. Match every fenced code block that opens with ```python
+    # -------------------------------
+    fenced_pattern = r'(?m)^```python\s*\n(.*?)\n```'
+    fenced_blocks = re.findall(fenced_pattern, text, flags=re.DOTALL)
+
+    if fenced_blocks:
+        # Return the longest Python code block
+        longest_block = max(fenced_blocks, key=len)
+        return longest_block.strip()
+
+    # -------------------------------
+    # 2. No fenced blocks: look for Python function/class definitions
+    # -------------------------------
+
+    # Pattern to match Python function/class definitions
+    def_class_pattern = r'(def\s+\w+|class\s+\w+)'
+    def_class_matches = list(re.finditer(def_class_pattern, text))
+
+    if def_class_matches:
+        # Try to extract the code blocks containing these definitions
+        code_blocks = []
+
+        for match in def_class_matches:
+            start_pos = match.start()
+
+            # Find the boundaries of this code block
+            # Look for the beginning (previous blank line or start of text)
+            block_start = 0
+            for i in range(start_pos, -1, -1):
+                if i == 0:
+                    block_start = i
+                    break
+                # A blank line (two consecutive newlines) marks the boundary
+                elif i >= 1 and text[i - 1:i + 1] == '\n\n':
+                    block_start = i + 1  # Start after the blank line
+                    break
+
+            # Look for the end (next blank line or end of text)
+            block_end = len(text)
+            for i in range(start_pos, len(text)):
+                if i == len(text) - 1:
+                    block_end = len(text)
+                    break
+                # A blank line marks the boundary
+                elif i < len(text) - 1 and text[i:i + 2] == '\n\n':
+                    block_end = i + 1  # Include the first newline
+                    break
+
+            code_block = text[block_start:block_end].strip()
+            if code_block and ('def ' in code_block or 'class ' in code_block):
+                code_blocks.append(code_block)
+
+        if code_blocks:
+            # Return the longest block containing a function/class definition
+            return max(code_blocks, key=len)
+
+    # -------------------------------
+    # 3. No function/class definitions: look for import statements
+    # -------------------------------
+    import_pattern = r'(^import\s+\w+|^from\s+\w+\s+import)'
+    import_matches = list(re.finditer(import_pattern, text,
+                                      flags=re.MULTILINE))
+
+    if import_matches:
+        # Extract the code around import statements
+        import_blocks = []
+
+        for match in import_matches:
+            start_pos = match.start()
+
+            # Find boundaries
+            block_start = 0
+            for i in range(start_pos, -1, -1):
+                if i == 0:
+                    block_start = i
+                    break
+                elif i >= 1 and text[i - 1:i + 1] == '\n\n':
+                    block_start = i + 1
+                    break
+
+            block_end = len(text)
+            for i in range(start_pos, len(text)):
+                if i == len(text) - 1:
+                    block_end = len(text)
+                    break
+                elif i < len(text) - 1 and text[i:i + 2] == '\n\n':
+                    block_end = i + 1
+                    break
+
+            import_block = text[block_start:block_end].strip()
+            if import_block and ('import ' in import_block
+                                 or 'from ' in import_block):
+                import_blocks.append(import_block)
+
+        if import_blocks:
+            return max(import_blocks, key=len)
+
+    return None
+
+
+@ICL_EVALUATORS.register_module()
+class LCBProEvaluator(BaseEvaluator):
+    """Evaluator for the LiveCodeBench Pro dataset.
+
+    This evaluator extracts code from model outputs (Python or C++),
+    submits it to a remote evaluation service, and polls for results.
+
+    Args:
+        submit_url (str): URL for submitting code. Defaults to the LCB Pro service.
+        result_url (str): URL template for retrieving results. Defaults to the LCB Pro service.
+        timeout (int): Request timeout in seconds. Defaults to 10.
+        poll_interval (int): Interval between result polls in seconds. Defaults to 10.
+        max_retries (int): Maximum number of retries for failed requests. Defaults to 3.
+    """
+
+    def __init__(
+        self,
+        submit_url: str = 'http://lightcpverifier.ailab.ailab.ai/submit',
+        result_url: str = 'http://lightcpverifier.ailab.ailab.ai/result/{submission_id}',
+        timeout: int = 10,
+        poll_interval: int = 10,
+        max_retries: int = 3,
+    ) -> None:
+        """Initialize the LCBProEvaluator."""
+        self.submit_url = submit_url
+        self.result_url = result_url
+        self.timeout = timeout
+        self.poll_interval = poll_interval
+        self.max_retries = max_retries
+        super().__init__()
+
+    def _submit_code(self, pid: str, lang: str, code: str) -> int:
+        """Submit code to the evaluation service.
+
+        Args:
+            pid (str): Problem ID
+            lang (str): Programming language ('python' or 'cpp')
+            code (str): Code to evaluate
+
+        Returns:
+            int: Submission ID
+
+        Raises:
+            Exception: If submission fails after all retries
+        """
+        payload = {
+            'pid': pid,
+            'lang': lang,
+            'code': code,
+        }
+        no_proxy = {'http': None, 'https': None}
+
+        for attempt in range(self.max_retries):
+            try:
+                response = requests.post(self.submit_url,
+                                         json=payload,
+                                         timeout=self.timeout,
+                                         proxies=no_proxy)
+                response.raise_for_status()
+                return response.json()['sid']
+            except Exception as e:
+                if attempt == self.max_retries - 1:
+                    raise Exception(
+                        f'Failed to submit code after {self.max_retries} attempts: {e}'
+                    )
+                time.sleep(1)
+
+        raise Exception('Should not reach here')
+
+    def _get_result(self, submission_id: int) -> str:
+        """Get the evaluation result for a submission.
+
+        Args:
+            submission_id (int): Submission ID
+
+        Returns:
+            str: Result status ('Judging', 'Accepted', 'Judge Failed', etc.)
+        """
+        url = self.result_url.format(submission_id=submission_id)
+        no_proxy = {'http': None, 'https': None}
+
+        try:
+            response = requests.get(url,
+                                    proxies=no_proxy,
+                                    timeout=self.timeout)
+            if response.status_code == 404:
+                return 'Judging'
+            response.raise_for_status()
+            info = response.json()
+            status = info.get('status', '')
+            if status in ('queued', 'running', 'pending'):
+                return 'Judging'
+            if status == 'error':
+                return 'Judge Failed'
+            return info.get('result', 'Unknown')
+        except Exception as e:
+            return f'Error: {e}'
+
+    def _extract_code(self, text: str) -> tuple:
+        """Extract code from model output.
+
+        Tries to extract C++ code first, then Python code.
+
+        Args:
+            text (str): Model output text
+
+        Returns:
+            tuple: (code, language) or (None, None) if no code found
+        """
+        # Try C++ first
+        if re.search(r'```cpp', text):
+            code = extract_longest_cpp_code(text)
+            if code is not None:
+                return code, 'cpp'
+
+        # Try Python
+        if re.search(r'```python', text):
+            code = extract_longest_python_code(text)
+            if code is not None:
+                return code, 'python'
+
+        # If no language marker, try both extractors
+        cpp_code = extract_longest_cpp_code(text)
+        if cpp_code is not None:
+            return cpp_code, 'cpp'
+
+        python_code = extract_longest_python_code(text)
+        if python_code is not None:
+            return python_code, 'python'
+
+        return None, None
+
+    def score(self, predictions: List, references: List,
+              test_set: Dataset) -> Dict:
+        """Score code generation predictions against references.
+
+        Args:
+            predictions (list): List of model-generated code completions.
+            references (list): List of reference problem IDs.
+            test_set (Dataset): Dataset containing problem information (unused).
+
+        Returns:
+            dict: Evaluation results including:
+                - accuracy: Percentage of correctly solved problems
+                - details: Detailed results for each problem
+                - error: Error message if evaluation failed
+        """
+        if len(predictions) != len(references):
+            return {
+                'error': ('predictions and references have different '
+                          f'lengths. len(predictions): {len(predictions)}, '
+                          f'len(references): {len(references)}')
+            }
+
+        # `test_set` is not needed here: problem IDs come from `references`
+
+        # Track submissions
+        submissions = []
+        details = []
+
+        # Step 1: Extract code and submit
+        from tqdm import tqdm
+        print('Submitting code to evaluation service...')
+        for i in tqdm(range(len(predictions))):
+            prediction = predictions[i]
+            problem_id = references[i]
+
+            # Extract code from the prediction
+            code, lang = self._extract_code(prediction)
+
+            if code is None:
+                # No code found
+                submissions.append({
+                    'index': i,
+                    'problem_id': problem_id,
+                    'sid': None,
+                    'code': None,
+                    'lang': None,
+                    'error': 'No code extracted'
+                })
+            else:
+                try:
+                    # Submit the code
+                    sid = self._submit_code(problem_id, lang, code)
+                    submissions.append({
+                        'index': i,
+                        'problem_id': problem_id,
+                        'sid': sid,
+                        'code': code,
+                        'lang': lang,
+                        'error': None
+                    })
+                except Exception as e:
+                    submissions.append({
+                        'index': i,
+                        'problem_id': problem_id,
+                        'sid': None,
+                        'code': code,
+                        'lang': lang,
+                        'error': str(e)
+                    })
+
+        # Step 2: Poll for results
+        print('Polling for evaluation results...')
+        total_count = len(submissions)
+        accepted_count = 0
+
+        for submission in tqdm(submissions):
+            if submission['sid'] is None:
+                # Submission failed or no code was extracted
+                details.append({
+                    'problem_id': submission['problem_id'],
+                    'correct': False,
+                    'result': submission.get('error', 'Unknown error'),
+                    'code': submission.get('code'),
+                    'lang': submission.get('lang'),
+                })
+                continue
+
+            # Poll until the verdict is no longer 'Judging'
+            sid = submission['sid']
+            while True:
+                result = self._get_result(sid)
+                if result != 'Judging':
+                    correct = 'Accepted' in result
+                    if correct:
+                        accepted_count += 1
+                    details.append({
+                        'problem_id': submission['problem_id'],
+                        'correct': correct,
+                        'result': result,
+                        'code': submission['code'],
+                        'lang': submission['lang'],
+                    })
+                    break
+                time.sleep(self.poll_interval)
+
+        # Calculate accuracy
+        accuracy = 100 * accepted_count / total_count if total_count > 0 else 0
+
+        return {
+            'accuracy': accuracy,
+            'pass@1': accuracy,  # Alias for consistency with other evaluators
+            'details': details,
+        }
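
For reference, a minimal sketch (not part of the patch) of how the extraction heuristic behaves on a typical model reply. The sample reply text is invented, and calling the private `_extract_code` helper directly is for illustration only:

```python
from opencompass.datasets.livecodebench_pro import LCBProEvaluator

evaluator = LCBProEvaluator()  # default submit/result URLs; no network use here

# An invented model reply containing a fenced C++ block.
reply = ('Here is my solution:\n'
         '```cpp\n'
         '#include <cstdio>\n'
         'int main() { puts("hello"); return 0; }\n'
         '```\n')

code, lang = evaluator._extract_code(reply)
assert lang == 'cpp' and code.startswith('#include')

# A reply without a fence is also handled: the brace-matching fallback in
# extract_longest_cpp_code() recovers code that has '#include' lines above
# an 'int main(...)' definition.
```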