go_outdoors_learning/outreach_plot_generator.py at main · marshallyale/go_outdoors_learning · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
"""This script is used for Go Outdoors to take a csv file, parse it, and then create plots showing the children's progress"""
import argparse
import sys
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import pandas as pd
import regex as re


def get_topic_df_from_csv(csv_file):
    """Create the dictionary of dataframes from a csv file.

    Args:
        csv_file (str): String pointing to either relative or absolute path of excel file

    Returns:
        Any: Dictionary of dataframes which correspond to sheets in an excel file
    """
    topics = []
    df_dict = {}
    topic_regex = re.compile(r"(?<=How much did you know about )(.*)(?= BEFORE)")
    after_regex = re.compile(r"(?<=How much did you know about )(.*)(?= AFTER)")
    interest_regex = re.compile(r"(?<=How much did you LIKE )(.*)")
    df = pd.read_csv(csv_file)
    for column in df.columns:
        match = topic_regex.search(column)
        if match:
            topics.append(match.group(1))
    for topic in topics:
        filtered_columns = df.filter(regex=topic).columns
        # Select the columns you want to keep
        df_dict[topic] = df[filtered_columns]
    # Verify that the needed columns are in the dataframe
    for topic, topic_df in df_dict.items():
        columns = "-".join(topic_df.columns)
        after_match = after_regex.search(columns)
        interest_match = interest_regex.search(columns)
        if not after_match:
            print(f"The topic {topic} is missing the AFTER question. Exiting.")
            sys.exit(1)
        if not interest_match:
            print(f"The topic {topic} is missing the INTEREST question. Exiting.")
            sys.exit(1)
    return df_dict


def create_plottable_df(df, topic, use_scale):
    """Takes an initial dataframe and returns a knowledge dataframe and an
    interest dataframe which are ready to be plotted

    Args:
        df (pandas dataframe): Dataframe which is ready to be plotted

    Returns:
        pandas_dataframe: Dataframe which corresponds to the student's interest in a particular subject
        pandas_dataframe: Dataframe which corresponds to the student's before and after knowledge of a subject
        bool: True means it just uses the default scale as defined in Python. False tries to find scale from csv
    """
    # Need to pull out the correct scale
    if use_scale:
        learn_scale = {1: "Nothing", 2: "A little", 3: "Some", 4: "A lot"}
        interest_scale = {1: "Dislike", 2: "Like", 3: "Love"}
    else:
        learn_scale_match = re.search(r"(?<=lesson\?\n)((.|\n)*)", df.columns[0])
        if learn_scale_match:
            learn_scale = {}
            scale_group = learn_scale_match.group(1)
            scale_group = scale_group.split("\n")
            for line in scale_group:
                line = line.split(" = ")
                learn_scale[int(line[0])] = line[1]
        else:
            print(
                "Couldn't extract scale from csv. Assuming 1-4 for learning, 1-3 for interest"
            )
            learn_scale = {1: "Nothing", 2: "A little", 3: "Some", 4: "A lot"}
            interest_scale = {1: "Dislike", 2: "Like", 3: "Love"}
    # Need to now rename columns so they aren't so damn long
    # df.columns = scale.values()
    # # Setting column names
    column_rename_map = {}
    for column in df.columns:
        if "BEFORE" in column:
            column_rename_map[column] = "before"
        elif "AFTER" in column:
            column_rename_map[column] = "after"
        elif "LIKE" in column:
            column_rename_map[column] = "like"
        else:
            print(f"Column not known. {column}")
    df = df.rename(columns=column_rename_map)
    # Pulling out student votes for knowledge level from xlsx
    knowledge_before = df["before"].value_counts()
    knowledge_after = df["after"].value_counts()
    interest = df["like"].value_counts()
    # Have to concatenate both series into a dataframe, reset the indexes, and then transpose

    df_knowledge = pd.concat(
        [knowledge_before, knowledge_after], axis=1, keys=["Before", "After"]
    )
    df_knowledge = df_knowledge.sort_index(ascending=True)
    df_knowledge = df_knowledge.T
    knowledge_columns = list(
        df_knowledge.columns
    )  # Grab final column names (don't hardcode in case they change)
    df_knowledge[knowledge_columns] = df_knowledge[knowledge_columns].apply(
        lambda x: (x / x.sum()) * 100, axis=1
    )  # Convert everything to percentage
    df_knowledge = df_knowledge.rename(columns=learn_scale)
    df_knowledge.columns.name = topic
    df_int = pd.DataFrame(interest)
    interest_column = ["Interest"]
    df_int.columns = interest_column
    df_int[interest_column] = df_int[interest_column].apply(
        lambda x: (x / x.sum()) * 100, axis=0
    )  # Convert everything to percentage
    df_int.index.name = None  # Have to remove this or else it will display on the plots
    df_int = df_int.sort_index(ascending=True)
    df_int = df_int.rename(index=interest_scale)
    return df_knowledge, df_int


def main():
    """Main function"""
    csv_file = args.csv_file
    df_dict = get_topic_df_from_csv(csv_file)
    plot_dfs = []
    plot_interest = []
    for topic, df in df_dict.items():
        print(f"Generating plots for topic {topic}")
        plot_df, interest = create_plottable_df(df, topic, args.use_scale)
        plot_dfs.append(plot_df)
        plot_interest.append(interest)

    for i, plot_df in enumerate(plot_dfs):
        fig = plt.figure()
        topic = plot_df.columns.name
        subplot_title_x_loc = (
            0.5  # Change this to change the location of the title (i.e. The topics)
        )
        subplot_title_y_loc = 1.05
        fig.suptitle(topic, x=subplot_title_x_loc, y=subplot_title_y_loc, fontsize=14)
        axes = fig.subplots(nrows=1, ncols=2)

        ax1 = plot_df.plot(
            kind="bar",
            stacked=True,
            ax=axes[1],
            # color=["#2A788EFF", "#7AD151FF", "#FDE725FF", "#440154FF"],  # ,
            color=["#fd9c5a", "#ffceaa", "#a0cbeb", "#4099d7"],
            title="Knowledge Gains",
        )
        ax1.legend(loc="lower left", bbox_to_anchor=(1, 0), title="Knowledge Level")
        # Reversing the legend
        handles, labels = ax1.get_legend_handles_labels()
        # Reverse the order of handles and labels
        handles = handles[::-1]
        labels = labels[::-1]
        # Create a new legend with the reversed order
        ax1.legend(
            handles,
            labels,
            loc="lower left",
            bbox_to_anchor=(1, 0),
            title="Knowledge Level",
        )
        ax1.yaxis.set_major_formatter(mtick.PercentFormatter())

        ax2 = plot_interest[i].plot(
            kind="bar",
            ylabel="Votes",
            legend=False,
            ax=axes[0],
            title="Interest",
            color="#989898",
        )
        ax2.yaxis.set_major_formatter(mtick.PercentFormatter())
        fig.tight_layout()
        title = f"plots/{topic}.pdf"
        # fig.savefig(title, dpi=600, bbox_inches="tight")
        fig.savefig(title, format="pdf", dpi=600, bbox_inches="tight")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Creates plots from excel file for Go Outdoors"
    )
    parser.add_argument(
        "-f",
        "--file",
        type=str,
        dest="csv_file",
        help="csv file to create plots from",
        required=True,
    )
    parser.add_argument(
        "-s",
        "--scale",
        dest="use_scale",
        help="Add this flag to use the predefined scale in the program vs reading the scale from the csv",
        action="store_true",
    )
    args = parser.parse_args()
    main()