|
| 1 | +(** BibTeX Parser and Pretty Printer |
| 2 | +
|
| 3 | + This module parses BibTeX text into a typed representation, lets callers |
| 4 | + inspect or edit that representation, and formats it back into BibTeX. It models the standard BibTeX |
| 5 | + entry types and reports malformed input as parse errors alongside the items that could be parsed (see [parse_bibtex_with_errors]). *) |
| 6 | + |
| 7 | +(** How a field value was written in BibTeX source. Each constructor records the delimiter style together with the value. NOTE(review): presumably the string payloads exclude the surrounding delimiters — confirm against the parser. *) |
| 8 | +type field_value = |
| 9 | + | QuotedStringValue of string (** Value enclosed in double quotes, e.g. ["text"] *) |
| 10 | + | BracedStringValue of string (** Value enclosed in curly braces *) |
| 11 | + | UnquotedStringValue of string (** Raw value written without any delimiters *) |
| 12 | + | NumberValue of int (** Bare numeric value, held as an OCaml [int] *) |
| 13 | + |
| 14 | +type field = { name : string; value : field_value } |
| 15 | +(** A single BibTeX field: [name] is the field name and [value] is its value together with the way it was delimited (see {!type-field_value}). *) |
| 16 | + |
| 17 | +(** The standard BibTeX entry types. The variant is closed: there is no constructor for non-standard types, and {!val-entry_type_of_string} raises on names it does not recognize. *) |
| 18 | +type entry_type = |
| 19 | + | Article (** Journal article *) |
| 20 | + | Book (** Book with explicit publisher *) |
| 21 | + | Booklet |
| 22 | + (** Work that is printed and bound, but without a named publisher *) |
| 23 | + | Conference (** Conference entry. NOTE(review): standard BibTeX defines [conference] as a synonym of [inproceedings] — confirm whether this module treats the two differently *) |
| 24 | + | InBook (** Part of a book (chapter, section, etc.) *) |
| 25 | + | InCollection (** Part of a book having its own title *) |
| 26 | + | InProceedings (** Article in conference proceedings *) |
| 27 | + | Manual (** Technical documentation *) |
| 28 | + | MastersThesis (** Master's thesis *) |
| 29 | + | Misc (** Miscellaneous entry type *) |
| 30 | + | PhdThesis (** PhD thesis *) |
| 31 | + | Proceedings (** Conference proceedings as a whole *) |
| 32 | + | TechReport (** Technical report *) |
| 33 | + | Unpublished |
| 34 | + (** Document having an author and title, but not formally published *) |
| 35 | + |
| 36 | +(** One element of an entry body, as stored in the [contents] list of {!type-bibtex_entry}. *) |
| 37 | +type entry_content = |
| 38 | + | Field of field (** A field, i.e. a name together with its value *) |
| 39 | + | EntryComment of string (** Comment text appearing inside an entry *) |
| 40 | + |
| 41 | +type bibtex_entry = { |
| 42 | + entry_type : entry_type; (** Which kind of entry this is (article, book, ...) *) |
| 43 | + citekey : string; (** Citation key identifying the entry *) |
| 44 | + contents : entry_content list; (** Fields and comments of the entry body. NOTE(review): presumably in source order — confirm *) |
| 45 | +} |
| 46 | +(** A complete BibTeX entry: its type, its citation key and its body. *) |
| 47 | + |
| 48 | +(** A top-level item of a BibTeX document: either an entry or a comment outside any entry. *) |
| 49 | +type bibtex_item = |
| 50 | + | Entry of bibtex_entry (** A bibliographic entry *) |
| 51 | + | Comment of string (** A top-level comment line (comments inside an entry are [EntryComment] values instead) *) |
| 52 | + |
| 53 | +type parse_error = { line : int; position : int; message : string } |
| 54 | +(** One parse failure: [line] and [position] locate it in the input and [message] describes it. NOTE(review): this interface does not state whether [line] and [position] are 0- or 1-based, nor whether [position] is a column or an absolute offset — confirm against the parser. *) |
| 55 | + |
| 56 | +type parse_result = { items : bibtex_item list; errors : parse_error list } |
| 57 | +(** Outcome of a parse: [items] holds what was parsed successfully and [errors] the failures encountered. Returned by {!val-parse_bibtex_with_errors}. *) |
| 58 | + |
| 59 | +(** {2 Parsing Functions} *) |
| 60 | + |
| 61 | +val parse_bibtex : string -> bibtex_item list |
| 62 | +(** [parse_bibtex input] parses the BibTeX text [input] into a list of items. |
| 63 | + Parse errors are ignored: only successfully parsed items are returned. Use {!val-parse_bibtex_with_errors} when the errors need to be inspected. |
| 64 | + @param input The BibTeX content as a string |
| 65 | + @return The successfully parsed BibTeX items *) |
| 66 | + |
| 67 | +val parse_bibtex_with_errors : string -> parse_result |
| 68 | +(** [parse_bibtex_with_errors input] parses the BibTeX text [input] and returns both |
| 69 | + the successfully parsed items and any errors encountered, as a {!type-parse_result}. |
| 70 | + @param input The BibTeX content as a string |
| 71 | + @return Parse result whose [items] and [errors] fields hold the parsed items and the parse errors *) |
| 72 | + |
| 73 | +val has_parse_errors : parse_result -> bool |
| 74 | +(** [has_parse_errors result] tells whether [result] contains at least one parse error. |
| 75 | + @param result The parse result to check |
| 76 | + @return [true] if there are errors, [false] otherwise *) |
| 77 | + |
| 78 | +val get_parse_errors : parse_result -> parse_error list |
| 79 | +(** [get_parse_errors result] returns the parse errors held in [result]. NOTE(review): presumably just the [errors] field — confirm. |
| 80 | + @param result The parse result |
| 81 | + @return List of parse errors *) |
| 82 | + |
| 83 | +val get_parsed_items : parse_result -> bibtex_item list |
| 84 | +(** [get_parsed_items result] returns the successfully parsed items held in [result]. NOTE(review): presumably just the [items] field — confirm. |
| 85 | + @param result The parse result |
| 86 | + @return List of parsed BibTeX items *) |
| 87 | + |
| 88 | +(** {2 Pretty Printers} *) |
| 89 | + |
| 90 | +val pretty_print_bibtex : bibtex_item list -> string |
| 91 | +(** [pretty_print_bibtex items] formats the list [items] (entries and top-level comments) into a single |
| 92 | + BibTeX string. |
| 93 | + @param items List of BibTeX items to format |
| 94 | + @return Complete formatted BibTeX string *) |
| 95 | + |
| 96 | +val clean_bibtex : string -> string |
| 97 | +(** [clean_bibtex input] parses the BibTeX text [input] and formats it again, which cleans |
| 98 | + and normalizes the formatting. NOTE(review): presumably input that fails to parse is dropped from the output, as with [parse_bibtex] — confirm before using this on files that may be malformed. |
| 99 | + @param input The BibTeX content to clean |
| 100 | + @return Cleaned and reformatted BibTeX string *) |
| 101 | + |
| 102 | +(** {2 Utility Functions for custom formatting or editing} *) |
| 103 | + |
| 104 | +val string_of_entry_type : entry_type -> string |
| 105 | +(** [string_of_entry_type entry_type] returns the string |
| 106 | + representation of [entry_type] (e.g. [Article] becomes ["article"]). See {!val-entry_type_of_string} for the opposite conversion. *) |
| 107 | + |
| 108 | +val entry_type_of_string : string -> entry_type |
| 109 | +(** [entry_type_of_string str] converts the type name [str] to an entry type. Unrecognized names are reported by raising, so callers handling arbitrary input should be prepared to catch the exception. |
| 110 | + @param str The string representation (case-insensitive) |
| 111 | + @return The corresponding entry type |
| 112 | + @raise Invalid_argument if the string is not a recognized entry type *) |
| 113 | + |
| 114 | +val format_field_value : field_value -> string |
| 115 | +(** [format_field_value value] formats the field value [value] for output. See {!val-format_field_value_with_url_unescaping} for the variant that also takes the field name into account. |
| 116 | + @param value The field value to format |
| 117 | + @return String representation of the value *) |
| 118 | + |
| 119 | +val format_field_value_with_url_unescaping : string -> field_value -> string |
| 120 | +(** [format_field_value_with_url_unescaping field_name value] formats the field |
| 121 | + value [value] with URL unescaping and Unicode normalization applied. Special |
| 122 | + handling is applied to URL fields, which are recognized by [field_name]. NOTE(review): which field names count as URL fields, and which Unicode normalization form is used, are not stated in this interface — confirm against the implementation. |
| 123 | + @param field_name |
| 124 | + The name of the field (used to determine if URL processing is needed) |
| 125 | + @param value The field value to format |
| 126 | + @return String representation with URLs unescaped if applicable *) |
| 127 | + |
| 128 | +val format_field : field -> string |
| 129 | +(** [format_field field] formats a complete field, i.e. both its name and its value, in the form [name = value]. |
| 130 | + @param field The field to format |
| 131 | + @return String representation of the field *) |
| 132 | + |
| 133 | +val format_entry_content : entry_content -> string |
| 134 | +(** [format_entry_content content] formats one element of an entry body, which is either a [Field] or an [EntryComment]. |
| 135 | + @param content The entry content to format |
| 136 | + @return String representation of the content *) |
| 137 | + |
| 138 | +val format_entry : bibtex_entry -> string |
| 139 | +(** [format_entry entry] formats a complete BibTeX entry (see {!type-bibtex_entry} for what an entry holds). |
| 140 | + @param entry The entry to format |
| 141 | + @return String representation of the entry *) |
| 142 | + |
| 143 | +val format_bibtex_item : bibtex_item -> string |
| 144 | +(** [format_bibtex_item item] formats one top-level BibTeX item, which is either an [Entry] or a [Comment]. |
| 145 | + @param item The item to format |
| 146 | + @return String representation of the item *) |
0 commit comments