study-framework-wiki/uploads/a97f6c3b8caebd3c21b9ae6c903a6314/ReconstructPhaseTablesFromParticipantLogs.py at master · JohnboyJovi/study-framework-wiki · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# This script reconstructs phase tables with missing data from the participant logs.
# Prerequisites:
#  - The headers of the phase tables need to be valid (apart from the header, the files can be empty)
#  - IndependentVariables.csv needs to be complete (we simply assume this since it was the case for us; otherwise restore that file as well!)
#  - No underscore (_) in the phase names or multi-trial variable names

import os
from contextlib import chdir

# Folder whose files are read as participant logs (RecoverDataTable changes into it).
FolderToLogs = "StudyLogs/ParticipantLogs"
# Folder holding IndependentVariables.csv and the Phase*.csv tables; recovered tables
# are written to its "Recovered" subfolder.
FolderToPhaseTables = "StudyLogs"

# Filled by LoadIndependentVars: maps the first column value of each row of
# IndependentVariables.csv to a dict {column name: value} for that row.
# RecoverDataTable looks entries up by the participant ID taken from the log filename.
IndependentVars = {}

def LoadIndependentVars():
    """Read IndependentVariables.csv into the module-level IndependentVars dict.

    The file is first passed through ConvertCoding. Its first line provides
    the column names; every following line becomes one dict mapping column
    name to value, stored under the value of that line's first column.
    Lines are split on plain commas (no quoting support).
    """
    iv_path = os.path.join(FolderToPhaseTables, "IndependentVariables.csv")
    ConvertCoding(iv_path)
    with open(iv_path, 'r') as iv_file:
        rows = iter(iv_file)
        first_row = next(rows, None)
        if first_row is None:
            # empty file: nothing to load
            return
        column_names = first_row.strip().split(",")
        for raw_row in rows:
            values = raw_row.strip().split(",")
            # a row with more values than columns raises IndexError here
            IndependentVars[values[0]] = {
                column_names[idx]: value for idx, value in enumerate(values)
            }

def CreateEntryLine(header_entries, data):
    """Build one CSV row (terminated by a newline) for the given columns.

    Args:
        header_entries: column names, in output order.
        data: dict mapping column name to its (string) value.

    Returns:
        The comma-separated values in header order followed by "\\n".
        Columns missing from `data` are left empty. An empty header list
        yields "".

    Cells are separated by position, so a column name that occurs more than
    once (in particular one equal to the last column's name) no longer
    produces a newline in the middle of the row.
    """
    if not header_entries:
        return ""
    cells = []
    for entry in header_entries:
        if entry in data:
            cells.append(data[entry])
            continue
        cells.append("")
        # some vars are not required (which we don't know here, so they were
        # added manually for our case); names are kept exactly as in the tables
        if entry not in ("lowerPrio", "higerPrio"):
            print("WARNING: Missing data for "+entry)
    return ",".join(cells) + "\n"

def CheckForSplitCommaInSentence(entries):
    """Re-join fragments of quoted values that were split at a comma.

    `entries` is the result of splitting a line on ",". A fragment that
    starts with a double quote but does not close it opens a quoted value;
    it is merged with the following fragments up to and including the first
    one ending in a double quote. Each swallowed comma is replaced by the
    placeholder "[Komma]" so the merged value stays a single CSV cell.

    Args:
        entries: list of string fragments.

    Returns:
        A new list with the quoted values merged; other fragments unchanged.

    If a quote is never closed, all remaining fragments are merged into that
    value instead of raising IndexError. A fragment consisting of only a
    double quote is treated as an opening quote (a quoted value that begins
    with a comma), not as an already closed value.
    """
    entries_cleaned = []
    i = 0
    while i < len(entries):
        entry = entries[i]
        opens_quote = entry.startswith("\"") and (
            len(entry) == 1 or not entry.endswith("\"")
        )
        if opens_quote:
            parts = [entry]
            i += 1
            # collect fragments until the closing quote or the end of the list
            while i < len(entries):
                parts.append(entries[i])
                if entries[i].endswith("\""):
                    break
                i += 1
            entries_cleaned.append("[Komma]".join(parts))
        else:
            entries_cleaned.append(entry)
        i += 1
    return entries_cleaned

def RecoverDataTable(phase_filename: str, multi_trial: bool) -> None:
    """Rebuild one phase table from the participant logs.

    The header is taken from the first line of the existing table
    `FolderToPhaseTables/phase_filename`. Every file in `FolderToLogs` is then
    scanned line by line, and the reconstructed rows are written (header
    first) to `FolderToPhaseTables/Recovered/phase_filename`.

    Args:
        phase_filename: file name of the phase table inside FolderToPhaseTables.
            For a normal table the phase name is the file name without
            "Phase_" and ".csv"; for a multi-trial table the phase name and
            the variable name are the 2nd and 3rd "_"-separated parts.
        multi_trial: if True, one row is emitted per "Recorded" line of the
            multi-trial variable; otherwise one row per matching condition,
            emitted when its "EndCondition" line is reached.

    Raises:
        KeyError: if the participant ID parsed from a log file name is not a
            key of IndependentVars.

    Requires LoadIndependentVars() to have filled IndependentVars beforehand.
    """
    phase_name = phase_filename.replace("Phase_","").replace(".csv","")
    multi_trial_var_name = ""
    if multi_trial:
        # file name is split on "_": [1] is the phase, [2] the variable name
        # (hence the "no underscore in names" prerequisite)
        phase_name = phase_filename.replace(".csv","").split("_")[1]
        multi_trial_var_name = phase_filename.replace(".csv","").split("_")[2]
    header_entries = []
    out_lines = []
    # read in the header of this table; the raw header line becomes the first output line
    with open(os.path.join(FolderToPhaseTables, phase_filename), 'r') as f:
        header_line = f.readline()
        header_entries = header_line.strip().split(",")
        out_lines.append(header_line)

    # now read through all participant logs and gather relevant information
    # chdir (contextlib, Python 3.11+) so the bare names from os.listdir(".") work
    # with os.path.isfile / os.path.getmtime / open
    with chdir(FolderToLogs):
        # logs are processed in order of modification time, oldest first
        for filename in sorted(filter(os.path.isfile, os.listdir(".")), key=os.path.getmtime):
            with open(filename, 'r') as file:
                # participant ID = text between the first "-" and the following "_" of the file name
                ParticipantID = filename.split("-")[1].split("_")[0]
                reading_relevant_condition = False
                trial_nr = 0
                start_time = 0.0
                data = {}
                for line in file:
                    if "Start Condition:" in line and "Phase: "+phase_name+";" in line:
                        reading_relevant_condition = True
                        trial_nr = 0 # simply not used if not multi-trial var
                        # text before the first ":" (with "#" removed) is parsed as a float;
                        # presumably the log timestamp in seconds -- TODO confirm
                        start_time = float(line.strip().replace("#","").split(":")[0])
                        data = {"Phase":phase_name}
                        data.update(IndependentVars[ParticipantID]) # adds ParticipantId and IVs to dict
                        # the text between the first "(" and the next ")" holds ";"-separated "factor: level" pairs
                        for factor_levels in line.split("(")[1].split(")")[0].split(";"):
                            if "Phase:" in factor_levels:
                                continue
                            # [1:] drops the first character of the piece (presumably the space after ";")
                            factor, level = factor_levels[1:].split(": ")
                            data[factor] = level
                    if reading_relevant_condition and "EndCondition" in line:
                        reading_relevant_condition = False
                        if not multi_trial:
                            # in multi_trial case we store data not at the end but when it is recorded
                            # Time = end value minus start value, formatted with two decimals
                            data["Time"] = "{:.2f}".format(float(line.strip().replace("#","").split(":")[0]) - start_time)
                            out_lines.append(CreateEntryLine(header_entries, data))
                            #print(line)
                    if "Recorded" in line:
                        # third space-separated token is the variable name followed by ":";
                        # everything after the third space is its value
                        var_name = line.split(" ")[2][:-1] # last part removes ":" from the end
                        var_value = line.strip().split(" ",3)[3]
                        if multi_trial and var_name == multi_trial_var_name and reading_relevant_condition:
                            # this is a multi trial var we are looking for
                            var_entries = var_value.replace("{","").replace("}","").split(",")
                            # maybe we split at , in phrases which are escaped in ", so check that!
                            var_entries = CheckForSplitCommaInSentence(var_entries)
                            # the values fill the LAST len(var_entries) header columns, in order
                            for i in range(0,len(var_entries)):
                                header_index = len(header_entries)-len(var_entries)+i
                                data[header_entries[header_index]] = var_entries[i]
                            data["Trial"] = str(trial_nr)
                            trial_nr += 1
                            out_lines.append(CreateEntryLine(header_entries, data))
                        # NOTE(review): this branch does not check reading_relevant_condition, so values
                        # recorded outside the matching condition are stored in data as well
                        # (data is replaced at the next matching "Start Condition")
                        if (not multi_trial) and var_name in header_entries:
                            data[var_name] = var_value
    # now write this (back in the original working directory, so the relative path is valid again)
    if not os.path.exists(os.path.join(FolderToPhaseTables,"Recovered")):
        os.mkdir(os.path.join(FolderToPhaseTables,"Recovered"))
    with open(os.path.join(FolderToPhaseTables,"Recovered",phase_filename), 'w') as f:
        f.writelines(out_lines)


def ConvertCoding(full_filename):
    """Rewrite a text file in place as UTF-8 without byte order marks.

    The file is decoded as 'utf-8-sig' (which drops a BOM at the very start)
    and any further BOM characters at the beginning of a line are stripped,
    since Unreal puts them in there. The result is written back as 'utf-8'.
    """
    bom = '\ufeff'
    cleaned = []
    with open(full_filename, mode='r', encoding='utf-8-sig') as src:
        for raw_line in src:
            cleaned.append(raw_line.lstrip(bom))
    with open(full_filename, mode='w', encoding='utf-8') as dst:
        dst.write("".join(cleaned))


def Main():
    """Recover every phase table found in FolderToPhaseTables.

    Loads the independent variables, then handles each file whose name starts
    with "Phase" and ends with ".csv": the file is cleaned with ConvertCoding,
    its path is printed, and RecoverDataTable is called. A table counts as a
    multiple-trial table when its header line contains ",Trial,".
    """
    LoadIndependentVars()
    for table_file in os.listdir(FolderToPhaseTables):
        is_phase_table = table_file.startswith("Phase") and table_file.endswith(".csv")
        if not is_phase_table:
            continue
        table_path = os.path.join(FolderToPhaseTables, table_file)
        ConvertCoding(table_path)
        print(table_path)
        # only the header line is needed to tell the two table kinds apart
        with open(table_path, 'r') as table:
            first_line = table.readline()
        RecoverDataTable(table_file, ",Trial," in first_line)


# Runs the recovery as soon as this file is executed (there is no __main__ guard,
# so importing the module triggers it as well).
Main()