scraper-and-summarizer/summarizer.py at main · sebsuenkler/scraper-and-summarizer · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import os
from openai import OpenAI
from dotenv import load_dotenv

def summarize_text(text):
    """
    Summarize a text in its own primary language using an AI model.

    This function orchestrates the summarization process:
    1. Asks the model for the primary language of the text
    2. Asks the model for a summary written in that language
    3. Returns the summary

    Empty or whitespace-only input short-circuits to an empty string, so
    no API request is made for it.

    Args:
        text: The text to be summarized

    Returns:
        A string containing the summary ("" if the text is empty or blank)
    """
    # Nothing to summarize: skip the two API round-trips entirely.
    if not text or (isinstance(text, str) and not text.strip()):
        return ""

    def init_client():
        """
        Initializes and returns the OpenAI client for the Nebius API.

        This function:
        1. Loads environment variables from the .env file
        2. Sets up the API client with the Nebius base URL and API key

        Returns:
            An OpenAI client configured for the Nebius API
        """
        load_dotenv()  # Makes NEBIUS_API_KEY from .env visible in os.environ
        return OpenAI(
            base_url="https://api.studio.nebius.com/v1/",  # Nebius API endpoint
            api_key=os.environ.get("NEBIUS_API_KEY"),
        )

    def get_response(prompt, tokens):
        """
        Sends a prompt to the AI model and retrieves the response.

        Uses the client created once for this summarize_text call, so the
        .env file is not re-read and the client is not rebuilt per request.

        Args:
            prompt: The prompt text to send to the AI
            tokens: Maximum number of tokens for the response

        Returns:
            The stripped text response from the AI model ("" if the model
            returned no text content)
        """
        response = client.chat.completions.create(
            model="microsoft/phi-4",
            max_tokens=tokens,          # Maximum length of response
            temperature=0.2,            # Lower creativity (more focused)
            top_p=0.85,                 # Restricted word variety
            presence_penalty=0.21,      # Avoid repetition
            extra_body={
                "top_k": 20,            # Consider only top 20 token options
            },
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": prompt,
                        }
                    ],
                }
            ],
        )

        # message.content may be None; treat that as an empty answer
        # instead of failing on None.strip().
        content = response.choices[0].message.content
        return (content or "").strip()

    def parse_language(reply):
        """
        Extracts the bare language name from a "Language: <language>" reply.

        The last matching line wins because the prompt asks for the answer
        "at the end". If no such line exists, the reply is returned
        unchanged.

        Args:
            reply: The raw model reply to the language-detection prompt

        Returns:
            The language name without the "Language:" label, or the
            original reply when the expected form is not found
        """
        for line in reversed(reply.splitlines()):
            label, sep, value = line.partition(":")
            value = value.strip(" *")
            # strip(" *") tolerates markdown emphasis such as "**Language:**".
            if sep and label.strip(" *").lower() == "language" and value:
                return value
        return reply

    def detect_language(text):
        """
        Determines the primary language of the provided text.

        The AI is asked to pick the dominant language by word frequency,
        even if multiple languages are present.

        Args:
            text: The text to analyze for language

        Returns:
            A string containing the detected language
        """
        prompt = (
            "Recognize the main language of this text disregard the content, topic, or subject matter completely. Use a frequency analysis of the words. Always choose the language with the most words, regardless of whether there are other languages in the text. Do not make assumptions based on mentioned people, places, or organizations.\n\n"
            "At the end, **only enter the language as a single word** in the following form. No explanation, no introduction, no justification - just the keyword:\n\n"
            # The blank line keeps the answer template separate from the text.
            "Language: <language>\n\n"
            f"Text:\n{text}\n\n"
        )
        return parse_language(get_response(prompt, 100))

    def create_summary(text, language):
        """
        Generates a concise summary of the provided text in the specified language.

        The AI is asked for a multi-paragraph summary without any
        introduction, explanation, or translation.

        Args:
            text: The text to summarize
            language: The language to use for the summary

        Returns:
            A string containing the formatted summary
        """
        prompt = (
            f"Read the following text and summarize its content briefly and precisely in **{language}** as continuous text. Try to limit the summary to a maximum of 300 words if appropriate. "
            f"Return **only** the summary in {language} - without introduction, without original text, without explanation, without a translation:\n\n"
            f"{text}\n\n"
            "Divide the summary into paragraphs and separate each paragraph with a blank line (\n\n) to improve readability."
        )
        # Allow up to 1500 tokens for a detailed summary
        return get_response(prompt, 1500)

    # Execute the summarization workflow with a single shared client
    client = init_client()
    language = detect_language(text)
    return create_summary(text, language)