|
| 1 | +#include "wer.h" |
| 2 | + |
| 3 | +#include <cstdio> |
| 4 | +#include <vector> |
| 5 | +#include <string> |
| 6 | +#include <filesystem> |
| 7 | +#include <fstream> |
| 8 | +#include <cstring> |
| 9 | +#include <map> |
| 10 | + |
// Collects the paths of all regular files ending in ".txt" that sit directly
// inside `dir_path` (std::filesystem::directory_iterator does not recurse).
//
// The order of the returned paths is whatever the directory iterator yields.
// If the filesystem library throws while iterating, the error is printed to
// stdout and the paths gathered up to that point are returned.
std::vector<std::string> read_files_from_directory(const std::string& dir_path) {
    std::vector<std::string> txt_paths;

    try {
        for (const auto& item : std::filesystem::directory_iterator(dir_path)) {
            // Skip directories, sockets, etc. -- only plain files qualify.
            if (!item.is_regular_file()) {
                continue;
            }
            // Only transcription files with a .txt extension are of interest.
            if (item.path().extension() != ".txt") {
                continue;
            }
            txt_paths.push_back(item.path().string());
        }
    } catch (const std::filesystem::filesystem_error& err) {
        printf("Error reading directory %s: %s\n", dir_path.c_str(), err.what());
    }

    return txt_paths;
}
| 24 | + |
// Reads the text file at `file_path` line by line and returns its content.
//
// Every line read is followed by a single '\n' in the result, so the returned
// text always ends with a newline when the file contained at least one line.
// If the file cannot be opened, a message is printed to stdout and an empty
// string is returned.
std::string read_file_content(const std::string& file_path) {
    std::ifstream input(file_path);
    if (!input.is_open()) {
        printf("Unable to open file: %s\n", file_path.c_str());
        return std::string();
    }

    std::string text;
    for (std::string row; std::getline(input, row);) {
        text.append(row);
        text.push_back('\n');
    }

    // The stream is closed by its destructor when `input` goes out of scope.
    return text;
}
| 41 | + |
// Returns the final component (file name including extension) of `path`,
// as produced by std::filesystem::path::filename().
std::string get_base_filename(const std::string& path) {
    const std::filesystem::path full_path(path);
    const std::filesystem::path leaf = full_path.filename();
    return leaf.string();
}
| 45 | + |
// Prints the command-line help text to stdout.
// `program_name` is inserted into the "Usage:" line (callers pass argv[0]).
void print_usage(const char* program_name) {
    // Fixed help lines; none contain format specifiers, so they are written verbatim.
    static const char* const option_lines[] = {
        "Options:\n",
        " -r, --reference PATH Full path to reference transcriptions directory\n",
        " -a, --actual PATH Full path to actual transcriptions directory\n",
        " --help Display this help message\n",
    };

    printf("Usage: %s [options]\n", program_name);
    for (const char* text : option_lines) {
        fputs(text, stdout);
    }
}
| 53 | + |
| 54 | +int main(int argc, char** argv) { |
| 55 | + if (argc == 1) { |
| 56 | + print_usage(argv[0]); |
| 57 | + return 0; |
| 58 | + } |
| 59 | + |
| 60 | + std::string reference_path; |
| 61 | + std::string actual_path; |
| 62 | + bool reference_set = false; |
| 63 | + bool actual_set = false; |
| 64 | + |
| 65 | + for (int i = 1; i < argc; i++) { |
| 66 | + if (strcmp(argv[i], "--help") == 0) { |
| 67 | + print_usage(argv[0]); |
| 68 | + return 0; |
| 69 | + } else if (strcmp(argv[i], "-r") == 0 || strcmp(argv[i], "--reference") == 0) { |
| 70 | + if (i + 1 < argc) { |
| 71 | + reference_path = argv[++i]; |
| 72 | + reference_set = true; |
| 73 | + } else { |
| 74 | + printf("Error: Missing path after %s\n", argv[i]); |
| 75 | + print_usage(argv[0]); |
| 76 | + return 1; |
| 77 | + } |
| 78 | + } else if (strcmp(argv[i], "-a") == 0 || strcmp(argv[i], "--actual") == 0) { |
| 79 | + if (i + 1 < argc) { |
| 80 | + actual_path = argv[++i]; |
| 81 | + actual_set = true; |
| 82 | + } else { |
| 83 | + printf("Error: Missing path after %s\n", argv[i]); |
| 84 | + print_usage(argv[0]); |
| 85 | + return 1; |
| 86 | + } |
| 87 | + } else { |
| 88 | + printf("Error: Unknown option: %s\n", argv[i]); |
| 89 | + print_usage(argv[0]); |
| 90 | + return 1; |
| 91 | + } |
| 92 | + } |
| 93 | + |
| 94 | + if (!reference_set || !actual_set) { |
| 95 | + printf("Error: Both reference and actual paths must be provided\n"); |
| 96 | + print_usage(argv[0]); |
| 97 | + return 1; |
| 98 | + } |
| 99 | + |
| 100 | + if (!std::filesystem::exists(reference_path) || !std::filesystem::is_directory(reference_path)) { |
| 101 | + printf("Error: Reference path '%s' does not exist or is not a directory\n", reference_path.c_str()); |
| 102 | + return 1; |
| 103 | + } |
| 104 | + |
| 105 | + if (!std::filesystem::exists(actual_path) || !std::filesystem::is_directory(actual_path)) { |
| 106 | + printf("Error: Actual path '%s' does not exist or is not a directory\n", actual_path.c_str()); |
| 107 | + return 1; |
| 108 | + } |
| 109 | + |
| 110 | + std::vector<std::string> reference_files = read_files_from_directory(reference_path); |
| 111 | + std::vector<std::string> actual_files = read_files_from_directory(actual_path); |
| 112 | + |
| 113 | + //printf("Found %zu reference files in %s\n", reference_files.size(), reference_path.c_str()); |
| 114 | + //printf("Found %zu actual files in %s\n", actual_files.size(), actual_path.c_str()); |
| 115 | + |
| 116 | + std::map<std::string, std::string> reference_map; |
| 117 | + std::map<std::string, std::string> actual_map; |
| 118 | + |
| 119 | + for (const auto& file : reference_files) { |
| 120 | + reference_map[get_base_filename(file)] = file; |
| 121 | + } |
| 122 | + |
| 123 | + for (const auto& file : actual_files) { |
| 124 | + actual_map[get_base_filename(file)] = file; |
| 125 | + } |
| 126 | + |
| 127 | + for (const auto& [filename, ref_path] : reference_map) { |
| 128 | + auto actual_it = actual_map.find(filename); |
| 129 | + if (actual_it != actual_map.end()) { |
| 130 | + std::string reference_content = read_file_content(ref_path); |
| 131 | + std::string actual_content = read_file_content(actual_it->second); |
| 132 | + |
| 133 | + wer_result result = calculate_wer(reference_content, actual_content); |
| 134 | + printf("Word Error Rate for : %s\n", filename.c_str()); |
| 135 | + printf(" Reference words: %ld\n", result.n_ref_words); |
| 136 | + printf(" Actual words: %ld\n", result.n_act_words); |
| 137 | + printf(" Substitutions: %d\n", result.n_sub); |
| 138 | + printf(" Deletions: %d\n", result.n_del); |
| 139 | + printf(" Insertions: %d\n", result.n_ins); |
| 140 | + printf(" Total edits: %d\n", result.n_edits); |
| 141 | + printf(" WER: %f\n", result.wer); |
| 142 | + } else { |
| 143 | + printf("Warning: No matching actual file found for reference file: %s\n", filename.c_str()); |
| 144 | + } |
| 145 | + } |
| 146 | + |
| 147 | + return 0; |
| 148 | +} |
0 commit comments