diff --git a/documentation.md b/documentation.md new file mode 100644 index 0000000000000..53e34ed2e46a8 --- /dev/null +++ b/documentation.md @@ -0,0 +1,84 @@ +## 1. Project Plan for Testing, Implementation, and Validation + +### 1.1 Team Roles and Responsibilities +Our team consists of **Ahmed, Herdi, Maxim, Annika, and Kim**, with roles distributed as follows: + +| **Team Member** | **Role** | **Responsibilities** | +|----------------|---------|-----------------| +| **Ahmed** | Issue Resolution & Implementation | Implementing fixes for new date formats in `Period` class and modifying `parse_time_string` to integrate logic for these formats. | +| **Herdi** | Repository Preparation, Test Execution & Additional Fixes | Setting up the environment, running the full test suite, integrating fixes, and handling additional bug fixes related to `Period` class behavior. | +| **Maxim, Annika, Kim** | Testing Team | Writing, documenting, and structuring test cases in `test_period.py`. Running and validating test cases before and after implementation. Ensuring coverage analysis is performed. | + +--- + +### 1.2 Current Project Plan +This project plan outlines how we implemented, tested, and validated the new date formats and additional fixes before merging into pandas. + +#### 1.2.1 Issue Overview +We have implemented four new date formats in the `Period` class (items 1–4) and fixed three related parsing bugs (items 5–7): + +1. **ISO 8601 Ordinal Dates** (e.g., `"1981-095"` → Interpreted as April 5, 1981). +2. **Multi-Year Spans** (e.g., `"2019-2021"` → Represents the range of years 2019 to 2021). +3. **Week Start-End Ranges** (e.g., `"20170123-20170129"` → Interpreted as a full week). +4. **Quarter-Based Multi-Year Periods** (e.g., `"2023Q1-2024Q3"` → Generates a quarterly range). +5. **`DateParseError` on weeks from the 24th century onwards** +6. **`DateParseError` on weeks in the 60s, 70s, 80s, or 90s** of any century. +7. 
**Correcting `freq` misinterpretations**, ensuring that hours are no longer mistaken for minutes in certain string formats. + +--- + +#### 1.2.2 Implementation, Testing, and Validation +The **implementation, testing, and validation** of the new formats and fixes were conducted **in parallel** by the team. All team members contributed to different aspects of development, and the documentation was **iteratively revised and refined** after testing. + +##### **Test Preparation (Before Implementation)** +- The **testing team (Maxim, Annika, and Kim)** worked on: + - Writing **test cases** covering the issue requirements/features. + - Documenting test cases, describing **what is tested and how they connect** to the feature. + - Adding test descriptions as **issue documentation** and **method docstrings** in `test_period.py`. + - Executing **test cases to confirm failure before implementation**, validating that the issue exists. + +##### **Feature Implementation** +- **Ahmed** worked on: + - Modifying `Period` and `parse_time_string` to **support the new formats**. + - Pushing the fixes to a **feature branch** for testing. +- **Herdi** worked on: + - **Integrating the fixes** into the repository. + - Ensuring **smooth compatibility** with existing pandas functionality. + - **Implementing fixes for additional issues**, including `DateParseError` failures and frequency misinterpretations. + +##### **Test Execution (After Implementation)** +- The **testing team**: + - Executed the **full pandas test suite** to verify that new changes do not break existing functionality. + - Ran the **new test cases** to ensure they **passed after implementation**. + +##### **Coverage and Final Validation** +- The **testing team**: + - Analyzed **test coverage**, ensuring that **key execution paths** were tested. + - Re-ran the **full test suite** with `pytest --cov` to verify coverage levels. + +--- + +#### 1.2.3 
Documentation & Refinement +Throughout the project, **all team members contributed** to the documentation, ensuring that it accurately described the issue, testing strategy, and implementation details. After completing the implementation and testing, the documentation was **revised and refined** to reflect the final methodology and test results. + +#### **Key Documentation Deliverables:** +- **Issue documentation** describing **requirements, test cases, and feature mapping**. +- **Test case descriptions** added to `test_period.py`. +- Final **test results and coverage analysis documentation**. +- **Structured patch submitted** to pandas for review. + +--- +### 1.3 **Future Project Plan: Enhancing `Period` Class** + +#### **Planned Features for Next Iteration** +- **Business Years Not Starting in January** + - Support fiscal years with **custom start months** (e.g., `"2023FY-Apr"`). + - Modify `Period` parsing to recognize and handle non-January year starts. + +- **Quarters Starting in Custom Months** + - Extend quarter parsing to support **non-standard starts** (e.g., `"2024Q1-Feb"`). + - Ensure consistency with fiscal calendars and existing pandas behavior. + +- **Business Days Support** + - Introduce **business day-based periods** (e.g., `"2023-05-BD"`). + - Align with pandas’ existing **BusinessDay (`B`) frequency**. diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 308183402198d..78facaf31dc73 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -4,6 +4,7 @@ Parsing functions for datetime and datetime-like strings. 
# Parsing for ISO ordinal dates, multi-year spans, multi-quarter spans and
# week start-end ranges.
# ----------------------------------------------------------------------
def parse_time_string(time_str, freq=None):
    """
    Parse the extended ``Period`` string formats.

    Recognised formats:

    1. ISO 8601 ordinal dates, ``YYYY-DDD`` (e.g. ``"1981-095"``).
    2. Multi-year spans, ``YYYY-YYYY`` (e.g. ``"2019-2021"``).
    3. Multi-quarter spans, ``YYYYQn-YYYYQn`` (e.g. ``"2019Q1-2021Q4"``).
    4. Week start-end ranges, ``YYYYMMDD-YYYYMMDD`` spanning exactly seven
       days (e.g. ``"20170123-20170129"``).

    Parameters
    ----------
    time_str : str
        Candidate string. Anything that is not a ``str`` is treated as
        "no match".
    freq : optional
        Accepted so callers can pass their frequency through; it is not
        consulted by this function.

    Returns
    -------
    Period, PeriodIndex or None
        * daily ``Period`` for an ordinal date,
        * ``PeriodIndex`` (``freq="Y"`` / ``freq="Q"``) for a multi-year or
          multi-quarter span,
        * weekly ``Period`` for a week range,
        * ``None`` when the string matches none of the formats or matches
          one syntactically but is invalid (day-of-year out of range,
          reversed span, invalid calendar date, range not seven days long).
    """
    if not isinstance(time_str, str):
        return None

    # The four patterns are mutually exclusive, so each one is only tried
    # once the previous ones have failed.

    # --- ISO 8601 ordinal date (YYYY-DDD) --------------------------------
    ordinal_match = re.match(r"^(\d{4})-(\d{3})$", time_str)
    if ordinal_match:
        year, day_of_year = map(int, ordinal_match.groups())
        try:
            # Keyword construction keeps the exact year; formatting the int
            # back into a string would drop leading zeros ("0095" -> "95").
            first_day = pd.Timestamp(year=year, month=1, day=1)
            days_in_year = 366 if first_day.is_leap_year else 365
            # Without this check day 000 or 366+ would silently roll over
            # into the neighbouring year.
            if not 1 <= day_of_year <= days_in_year:
                return None
            return pd.Period(
                first_day + pd.Timedelta(days=day_of_year - 1), freq="D"
            )
        except ValueError:
            return None  # Year not representable

    # --- Multi-year span (YYYY-YYYY) -------------------------------------
    multi_year_match = re.match(r"^(\d{4})-(\d{4})$", time_str)
    if multi_year_match:
        start_year, end_year = multi_year_match.groups()
        if int(start_year) > int(end_year):
            return None  # Reversed range
        try:
            return pd.period_range(start=start_year, end=end_year, freq="Y")
        except ValueError:
            return None  # Year not representable

    # --- Multi-quarter span (YYYYQn-YYYYQn) ------------------------------
    multi_quarter_match = re.match(
        r"^(\d{4}Q[1-4])-(\d{4}Q[1-4])$", time_str
    )
    if multi_quarter_match:
        start_q, end_q = multi_quarter_match.groups()
        start_key = (int(start_q[:4]), int(start_q[-1]))
        end_key = (int(end_q[:4]), int(end_q[-1]))
        # Reject reversed spans like the multi-year branch does, instead of
        # returning an empty index.
        if start_key > end_key:
            return None
        try:
            return pd.period_range(start=start_q, end=end_q, freq="Q")
        except ValueError:
            return None  # Year not representable

    # --- Week start-end range (YYYYMMDD-YYYYMMDD) ------------------------
    week_match = re.match(r"^(\d{8})-(\d{8})$", time_str)
    if week_match:
        start_date, end_date = week_match.groups()
        try:
            start = pd.Timestamp(start_date)
            end = pd.Timestamp(end_date)
        except ValueError:
            return None  # Not a real calendar date (e.g. month 13)

        # Only an exact seven-day window counts as a week.
        if (end - start).days != 6:
            return None

        # Anchor the weekly frequency on the weekday the range ends on so
        # the resulting Period covers exactly start..end. A Monday-Sunday
        # range yields "W-SUN", which is what plain "W" means.
        weekday_codes = ("MON", "TUE", "WED", "THU", "FRI", "SAT", "SUN")
        return pd.Period(start, freq=f"W-{weekday_codes[end.weekday()]}")

    return None  # No match found