Den4ikAI
diff --git a/‎LICENSE‎
Lines changed: 13 additions & 0 deletions b/‎LICENSE‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 7 additions & 13 deletions b/‎README.md‎
Lines changed: 7 additions & 13 deletions
diff --git a/‎ruaccent/__init__.py‎
Lines changed: 2 additions & 2 deletions b/‎ruaccent/__init__.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎ruaccent/accent_model.py‎
Lines changed: 14 additions & 6 deletions b/‎ruaccent/accent_model.py‎
Lines changed: 14 additions & 6 deletions
diff --git a/‎ruaccent/omograph_model.py‎
Lines changed: 55 additions & 11 deletions b/‎ruaccent/omograph_model.py‎
Lines changed: 55 additions & 11 deletions
@@ -0,0 +1,13 @@
+Copyright 2023 Denis Petrov
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
@@ -1,6 +1,8 @@
 # RUAccent
 
-RUAccent - это библиотека для автоматической расстановки ударений на русском языке. 
+RUAccent - это библиотека для автоматической расстановки ударений на русском языке.
+
+**Внимание!!! Смена лицензии на Apache 2.0**
 
 ## Установка
    С помощью pip
@@ -13,16 +15,14 @@ RUAccent - это библиотека для автоматической ра
    ```
 ## Параметры работы
 
-    load(omograph_model_size='big_poetry', use_dictionary=True, custom_dict={}, custom_homographs={})
-
+    load(omograph_model_size='big_poetry', use_dictionary=True, custom_dict={})
 
  - На данный момент доступно 6 моделей. **big** (рекомендуется к использованию), **medium** и **small**. Рекомендуются к использованию модели версии **poetry**. Их названия **big_poetry**, **medium_poetry**, **small_poetry**.
- - Модель **big** имеет 178 миллионов параметров, **medium** 85 миллионов, а **small** 42 миллиона
+ - Модель **big** имеет 178 миллионов параметров, **medium** 85 миллионов, а **small** 12 миллионов
  - Переменная **use_dictionary** отвечает за загрузку всего словаря (требуется больше ОЗУ), иначе все ударения расставляет нейросеть. 
- - Переменная **custom_homographs** отвечает за добавление своих омографов. Формат такой: `{'слово-омограф': ['вариант ударения 1', 'вариант ударения 2']}`. 
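- - Функция **custom_dict** отвечает за добавление своих вариантов ударений в словарь. Формат такой: `{'слово': 'сл+ово с удар+ением'}`
+ - Параметр **custom_dict** отвечает за добавление своих вариантов ударений в словарь. Формат такой: `{'слово': 'сл+ово с удар+ением'}`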
 
-
+    **Для работы требуется 5 гигабайт ОЗУ**
 ## Пример использования
 ```python
 from ruaccent import RUAccent
@@ -37,10 +37,4 @@ text = 'ежик нашел в лесу ягоды.'
 print(accentizer.process_yo(text))
 ```
 
-## Датасеты
-
-- [Датасет](https://huggingface.co/datasets/TeraTTS/nkrja_raw) собранный с [НКРЯ](https://ruscorpora.ru/) (удален по просьбе разработчиков НКРЯ)
-- [Датасет](https://huggingface.co/datasets/TeraTTS/stress_dataset_sft_proza) использовавшийся для обучения моделей акцентуатора (версия только с прозой)
-- [Датасет](https://huggingface.co/datasets/TeraTTS/stress_dataset_sft_poetry) использовавшийся для обучения моделей акцентуатора (версия проза + поэзия)
-
-Файлы моделей и словарей располагаются по [ссылке](https://huggingface.co/TeraTTS/accentuator). Мы будем признательны фидбеку на [telegram аккаунт](https://t.me/chckdskeasfsd)
+Файлы моделей и словарей располагаются по [ссылке](https://huggingface.co/ruaccent/accentuator). Мы будем признательны за фидбек в [telegram-аккаунт](https://t.me/chckdskeasfsd).
@@ -1,6 +1,6 @@
 """Russian accentizer"""
 
-__version__ = "1.5.4.1"
+__version__ = "1.5.5.2"
 
 
-from .ruaccent import RUAccent
+from .ruaccent import RUAccent
@@ -3,23 +3,26 @@
 from onnxruntime import InferenceSession
 from .char_tokenizer import CharTokenizer
 
+def softmax(x):
+    """Return the softmax of ``x`` taken along its last axis.
+
+    The maximum is subtracted per row (last axis) instead of globally.
+    The result is mathematically the same, but a row whose values all lie
+    far below the global maximum can no longer underflow to an all-zero
+    row and turn into NaN when it is normalised.
+
+    Args:
+        x: NumPy array of logits; the last axis holds the classes.
+
+    Returns:
+        Array of the same shape whose last axis sums to 1.
+    """
+    shifted = x - np.max(x, axis=-1, keepdims=True)
+    e_x = np.exp(shifted)
+    return e_x / e_x.sum(axis=-1, keepdims=True)
+
 class AccentModel:
     def __init__(self) -> None:
+        """Create an empty model; ``load`` sets the session, labels and tokenizer."""
         pass
 
-    def load(self, path):
-        self.session = InferenceSession(f"{path}/model.onnx", providers=["CPUExecutionProvider"])
+    def load(self, path, device="CPU"):
+        """Load the ONNX model, label map and tokenizer stored under ``path``.
+
+        Args:
+            path: Directory containing ``model.onnx`` and ``config.json``;
+                the same path is handed to ``CharTokenizer.from_pretrained``.
+            device: ``"CUDA"`` selects ``CUDAExecutionProvider``; any other
+                value selects ``CPUExecutionProvider``.
+        """
+        self.session = InferenceSession(f"{path}/model.onnx", providers=["CUDAExecutionProvider" if device == "CUDA" else "CPUExecutionProvider"])
 
+        # config.json supplies the id -> label mapping used to decode logits.
         with open(f"{path}/config.json", "r") as f:
             self.id2label = json.load(f)["id2label"]
         self.tokenizer = CharTokenizer.from_pretrained(path)
-        self.tokenizer.model_input_names = ["input_ids", "attention_mask"]
 
     def render_stress(self, text, pred):
         text = list(text)
         i = 0
         for chunk in pred:
-            if chunk != "NO":
+            if chunk['label'] != "NO" and chunk['label'] != "STRESS_SECONDARY" and chunk["score"] >= 0.55:
                 text[i - 1] = "+" + text[i - 1]
             i += 1
         text = "".join(text)
@@ -31,7 +34,12 @@ def put_accent(self, word):
         outputs = self.session.run(None, inputs)
         output_names = {output_key.name: idx for idx, output_key in enumerate(self.session.get_outputs())}
         logits = outputs[output_names["logits"]]
+        probabilities = softmax(logits)
+        scores = np.max(probabilities, axis=-1)[0]
         labels = np.argmax(logits, axis=-1)[0]
-        labels = [self.id2label[str(label)] for label in labels]
-        stressed_word = self.render_stress(word, labels)
+        pred_with_scores = [{'label': self.id2label[str(label)], 'score': float(score)} 
+                            for label, score in zip(labels, scores)]
+
+        stressed_word = self.render_stress(word, pred_with_scores)
+
         return stressed_word
@@ -7,23 +7,67 @@ class OmographModel:
     def __init__(self):
+        """Create an empty model; ``load`` sets the session and tokenizer."""
         pass
 
-    def load(self, path):
+    def load(self, path, device="CPU"):
+        """Load the ONNX session and the tokenizer stored under ``path``.
+
+        Args:
+            path: Directory containing ``model.onnx``; the same path is
+                handed to ``AutoTokenizer.from_pretrained``.
+            device: ``"CUDA"`` selects ``CUDAExecutionProvider``; any other
+                value selects ``CPUExecutionProvider``.
+        """
+        self.session = InferenceSession(f"{path}/model.onnx", providers=["CUDAExecutionProvider" if device == "CUDA" else "CPUExecutionProvider"])
         self.tokenizer = AutoTokenizer.from_pretrained(path)
-        self.session = InferenceSession(f"{path}/model.onnx", providers=['CPUExecutionProvider'])
-
+        
     def softmax(self, x):
-        e_x = np.exp(x - np.max(x))
-        return e_x / e_x.sum()
+        """Return the softmax of ``x`` taken along its last axis.
+
+        Normalising per row keeps every row a proper probability
+        distribution when ``x`` holds a batch of logits. For a single row
+        (or a 1-D array) the result equals normalising over the whole
+        array.
+
+        Args:
+            x: NumPy array of logits; the last axis holds the classes.
+
+        Returns:
+            Array of the same shape whose last axis sums to 1.
+        """
+        e_x = np.exp(x - np.max(x, axis=-1, keepdims=True))
+        return e_x / e_x.sum(axis=-1, keepdims=True)
 
-    def classify(self, text, hypotheses):
+    def group_words(self, words):
+        """Group consecutive stress variants of the same word.
+
+        Two entries belong to the same group when they are adjacent and
+        equal once the ``+`` stress marks are removed. Only adjacent runs
+        are merged, so each group is a contiguous slice of ``words``; this
+        is what ``transfer_grouping`` relies on when it cuts a parallel
+        list by group length. A word that occurs again later in the list
+        starts a new group instead of being folded into the earlier one.
+
+        Args:
+            words: Stress variants written with ``+`` marks.
+
+        Returns:
+            A list of groups (lists of variants) in input order.
+        """
+        groups = []
+        previous_key = None
+        for word in words:
+            key = word.replace('+', '')
+            if groups and key == previous_key:
+                groups[-1].append(word)
+            else:
+                groups.append([word])
+            previous_key = key
+        return groups
+        
+    def transfer_grouping(self, grouped_list, target_list):
+        """Cut ``target_list`` into consecutive slices shaped like ``grouped_list``.
+
+        The i-th returned slice has the length of the i-th group and starts
+        where the previous slice ended.
+
+        Args:
+            grouped_list: Groups whose lengths define the slice sizes.
+            target_list: Flat sequence to be sliced.
+
+        Returns:
+            A list of slices of ``target_list``, one per group.
+        """
+        regrouped = []
+        offset = 0
+        for size in map(len, grouped_list):
+            stop = offset + size
+            regrouped.append(target_list[offset:stop])
+            offset = stop
+        return regrouped
+        
+    def classify(self, texts, hypotheses):
+        """Choose the best-scoring stress variant for each homograph.
+
+        Args:
+            texts: Context strings; presumably one per entry of
+                ``hypotheses`` (they are tokenized pairwise) - confirm
+                against the caller.
+            hypotheses: Stress variants to score, written with ``+`` marks.
+
+        Returns:
+            A list holding the winning hypothesis of each group (odd-count
+            path) or of each consecutive pair (even-count path).
+        """
         hypotheses_probs = []
-        text = re.sub(r'\s+(?=(?:[,.?!:;…]))', r'', text)
-        for h in hypotheses:
-            inputs = self.tokenizer(text, h, return_tensors="np")
+        # Drop whitespace that directly precedes a punctuation mark.
+        preprocessed_texts = [re.sub(r'\s+(?=(?:[,.?!:;…]))', r'', text) for text in texts]
+        # NOTE(review): an odd hypothesis count routes to per-group scoring,
+        # an even count to the pairwise batch path. An even total made of
+        # groups that are not pairs (e.g. 3 + 3 variants) would be paired
+        # up below - confirm the caller never produces that.
+        if len(hypotheses) % 2 != 0:
+            outs = []
+            grouped_h = self.group_words(hypotheses)
+            grouped_t = self.transfer_grouping(grouped_h, preprocessed_texts)
+            for h, t in zip(grouped_h, grouped_t):
+                probs = []
+                # Every variant of the group is scored against the group's
+                # first text, one session run per variant.
+                for hp in h:
+                    inputs = self.tokenizer(t[0], hp, max_length=512, truncation=True, return_tensors="np")
+                    inputs = {k: v.astype(np.int64) for k, v in inputs.items()}
+                    outputs = self.session.run(None, inputs)[0]
+                    outputs = self.softmax(outputs)
+                    # Score = index 1 of the first output row; presumably
+                    # the "hypothesis is correct" class - TODO confirm.
+                    prob_label_is_true = [float(p[1]) for p in outputs][0]
+                    probs.append(prob_label_is_true)
+                    #print(h, prob_label_is_true)
+                # Keep the variant with the highest score in this group.
+                outs.append(h[probs.index(max(probs))])
+            return outs
+        else:
+            # Score all (text, hypothesis) pairs in one padded batch.
+            inputs = self.tokenizer(preprocessed_texts, hypotheses, return_tensors="np", padding=True, truncation=True, max_length=512)
             inputs = {k: v.astype(np.int64) for k, v in inputs.items()}
-
+    
             outputs = self.session.run(None, inputs)[0]
             outputs = self.softmax(outputs)
-            prob_label_is_true = [float(p[1]) for p in outputs][0]
-            hypotheses_probs.append(prob_label_is_true)
-        return hypotheses[hypotheses_probs.index(max(hypotheses_probs))]
+            #print(hypotheses)
+            # NOTE(review): this re-pairing of preprocessed_texts is not read
+            # again below.
+            preprocessed_texts = [(preprocessed_texts[i], preprocessed_texts[i+1]) for i in range(0, len(preprocessed_texts), 2)]
+            # Consecutive hypotheses (2k, 2k+1) are treated as the two
+            # variants of one word.
+            hypotheses =  [(hypotheses[i], hypotheses[i+1]) for i in range(0, len(hypotheses), 2)]
+            
+            # One score per input row: index 1 of that row of ``outputs``.
+            for i in range(len(texts)):
+                prob_label_is_true = float(outputs[i][1])
+                hypotheses_probs.append(prob_label_is_true)
+    
+            hypotheses_probs = [(hypotheses_probs[i], hypotheses_probs[i+1]) for i in range(0, len(hypotheses_probs), 2)]
+            outs = []
+            # From every pair keep the variant with the higher score.
+            for pair1, pair2 in zip(hypotheses, hypotheses_probs):
+              outs.append(pair1[pair2.index(max(pair2))])
+            return outs