|
22 | 22 | import numpy as np |
23 | 23 | import pandas as pd |
24 | 24 | from tqdm import tqdm |
| 25 | +import csv |
| 26 | +import re |
| 27 | +from abc import abstractmethod |
25 | 28 |
|
26 | 29 | from nlp_architect import LIBRARY_OUT |
27 | 30 | from nlp_architect.common.core_nlp_doc import CoreNLPDoc |
28 | 31 | from nlp_architect.models.absa.inference.data_types import ( |
29 | 32 | TermType, |
30 | 33 | SentimentDocEncoder, |
31 | 34 | SentimentDoc, |
| 35 | + SentimentSentence, |
32 | 36 | ) |
33 | 37 | from nlp_architect.models.absa.inference.inference import SentimentInference |
34 | 38 | from nlp_architect.models.absa.utils import load_opinion_lex |
|
40 | 44 | line_count, |
41 | 45 | ) |
42 | 46 |
|
43 | | -from .utils import Anonymiser, _ui_format |
44 | | - |
45 | 47 | SENTIMENT_OUT = LIBRARY_OUT / "absa_solution" |
46 | 48 |
|
47 | 49 |
|
class Anonymiser:
    """Base interface for anonymisation algorithms, intended for privacy keeping.

    Subclasses are expected to override :meth:`run`. This class does not use
    ``abc.ABCMeta``, so the ``@abstractmethod`` marker is not enforced when the
    class is instantiated; the default :meth:`run` simply returns ``None``.
    """

    @abstractmethod
    def run(self, text):
        """Anonymise ``text``; the base implementation does nothing."""
        return None
| 56 | + |
| 57 | + |
class TweetAnonymiser(Anonymiser):
    """Anonymiser for tweets which uses a lexicon for simple string replacements.

    The lexicon is a CSV file in which the first cell of each row is the
    anonymised replacement and the remaining non-empty cells are the entity
    strings to be replaced by it.

    Args:
        lexicon_path: Path to the UTF-8 encoded CSV lexicon file.
    """

    def __init__(self, lexicon_path):
        self.entity_dict = self._init_entity_dict(lexicon_path)

    @staticmethod
    def _init_entity_dict(lexicon_path):
        """Load the lexicon into a ``{replacement: [entity, ...]}`` mapping.

        Blank lines in the file are skipped and empty cells are dropped.
        """
        entity_dict = {}
        # newline="" lets the csv module do its own line-ending handling.
        with open(lexicon_path, encoding="utf-8", newline="") as lexicon_file:
            for row in csv.reader(lexicon_file):
                if not row:
                    # csv.reader yields an empty list for a blank line.
                    continue
                entity_dict[row[0]] = [entity for entity in row[1:] if entity]
        return entity_dict

    def run(self, text):
        """Return ``text`` with lexicon entities and unknown mentions anonymised.

        Every case-insensitive occurrence of a lexicon entity is replaced by its
        anonymised form. Afterwards, any whitespace-separated word that starts
        with ``@`` and whose remainder is not a lexicon replacement is turned
        into ``@other_entity``. Words are re-joined with single spaces.
        """
        for anonymised, entities in self.entity_dict.items():
            for entity in entities:
                # Entities are plain strings, not patterns: escape regex
                # metacharacters, and use a function replacement so the
                # anonymised text is inserted literally (no template parsing).
                pattern = re.escape(entity)
                text = re.sub(pattern, lambda _match: anonymised, text, flags=re.IGNORECASE)
        return " ".join(
            "@other_entity"
            if word.startswith("@") and word[1:] not in self.entity_dict
            else word
            for word in text.split()
        )
| 85 | + |
| 86 | + |
def _ui_format(sent: SentimentSentence, doc: SentimentDoc) -> str:
    """Render a sentence as HTML, wrapping each annotated term in a ``<span>``.

    The span class is ``<term type>_<term polarity>``; the classes cover aspects,
    opinions, negations and intensifiers. When several terms share the same start
    offset, only one of them is rendered.

    Args:
        sent: Sentence whose ``start``/``end`` offsets and ``events`` are used.
        doc: Document providing the ``doc_text`` the offsets refer to.

    Returns:
        The sentence text with the span markup inserted.
    """
    html = doc.doc_text[sent.start : sent.end + 1]

    terms = []
    for event in sent.events:
        terms.extend(event)
    terms.sort(key=lambda term: term.start)

    handled_starts = set()
    # Walk from the rightmost term to the leftmost so that inserting markup
    # never shifts the offsets of the terms that are still to be wrapped.
    for term in reversed(terms):
        if term.start in handled_starts:
            continue
        handled_starts.add(term.start)
        begin = term.start - sent.start
        finish = begin + term.len
        css_class = term.type.value + "_" + term.polarity.value
        html = (
            html[:begin]
            + '<span class="'
            + css_class
            + '">'
            + html[begin:finish]
            + "</span>"
            + html[finish:]
        )
    return html
| 101 | + |
| 102 | + |
48 | 103 | class SentimentSolution(object): |
49 | 104 | """Main class for executing Sentiment Solution pipeline. |
50 | 105 |
|
|
0 commit comments