# Source: lklic/AAT-CLIP-embeddings, extract-data.py (main branch)
from lxml import etree
import csv

def save_to_csv(data, file_name):
    """Write rows of (subject ID, combined text) to a UTF-8 encoded CSV file.

    The file at ``file_name`` is created, or overwritten if it already
    exists. A header row ``Subject ID, Combined Text`` is written first,
    followed by one CSV row per item of ``data`` in iteration order.

    Args:
        data: Iterable of row sequences, e.g. ``(subject_id, combined_text)``.
        file_name: Path of the CSV file to write.
    """
    header = ["Subject ID", "Combined Text"]
    # newline='' hands line-ending control to the csv module.
    with open(file_name, mode='w', newline='', encoding='utf-8') as out_file:
        csv_writer = csv.writer(out_file)
        csv_writer.writerow(header)
        csv_writer.writerows(data)

def _child_text(element, path):
    """Return the text of the first element under ``element`` matching ``path``.

    Returns an empty string when nothing matches or when the matched element
    has no text, so callers never receive ``None``.
    """
    found = element.find(path)
    if found is None or found.text is None:
        return ""
    return found.text


def extract_data(xml_path):
    """Extract (subject ID, combined text) pairs from an XML file of subjects.

    The file is read as UTF-8, lightly patched so that it parses (see the
    inline comments), and every ``Subject`` element is turned into one tuple.
    The combined text has the form
    ``"<preferred term>. Description: <English descriptive note>"``; missing
    or empty fields contribute an empty string.

    Only the preferred term and the note are included: the combined text is
    meant for CLIP, which accepts only 77 tokens per term, so the hierarchy
    string and record type are deliberately left out.

    Args:
        xml_path: Path of the XML file to read.

    Returns:
        A list of ``(subject_id, combined_text)`` tuples in document order.
        ``subject_id`` is the ``Subject_ID`` attribute of the ``Subject``
        element, or ``None`` if the attribute is absent.
    """
    with open(xml_path, 'r', encoding='utf-8') as file:
        xml_content = file.read()

    # Remove the XML declaration if present (only this exact spelling is
    # matched; any other spelling is left in place for the parser to handle).
    xml_content = xml_content.replace('<?xml version="1.0" encoding="UTF-8" ?>', '')

    # Declare the xsi prefix next to its use, but only when the document does
    # not declare it already: injecting a second xmlns:xsi attribute on the
    # same element would make the document ill-formed.
    if 'xmlns:xsi=' not in xml_content:
        xml_content = xml_content.replace(
            'xsi:noNamespaceSchemaLocation',
            'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation',
        )

    # Parse from bytes rather than str.
    root = etree.fromstring(xml_content.encode('utf-8'))
    nsmap = root.nsmap if root.nsmap else {}

    terms_data = []
    for subject in root.findall('.//Subject', namespaces=nsmap):
        subject_id = subject.get('Subject_ID')
        preferred_term = _child_text(subject, 'Terms/Preferred_Term/Term_Text')

        # Use the first note whose language is exactly "English"; a note
        # without Note_Text yields an empty description instead of crashing.
        descriptive_note = ""
        for note in subject.findall('Descriptive_Notes/Descriptive_Note'):
            if _child_text(note, 'Note_Language') == "English":
                descriptive_note = _child_text(note, 'Note_Text')
                break

        # Concatenate fields for embedding.
        combined_text = f"{preferred_term}. Description: {descriptive_note}"
        terms_data.append((subject_id, combined_text))

    return terms_data

# Script body: parse 'AAT.xml' and write the extracted rows to
# 'aat_terms.csv'. Both paths are relative to the current working directory.
aat_data = extract_data(xml_path='AAT.xml')
save_to_csv(data=aat_data, file_name='aat_terms.csv')