|
4 | 4 |
|
5 | 5 | import numpy as np |
6 | 6 |
|
7 | | -from wfdb.io import download |
| 7 | +from wfdb.io import download, _coreio |
8 | 8 |
|
9 | 9 |
|
10 | 10 | MAX_I32 = 2147483647 |
|
14 | 14 | ALIGNED_FMTS = ["8", "16", "32", "61", "80", "160"] |
15 | 15 | # Formats in which not all samples align with integer boundaries |
16 | 16 | UNALIGNED_FMTS = ["212", "310", "311", "24"] |
| 17 | +# Formats in which samples are encoded in a variable number of bits |
| 18 | +COMPRESSED_FMTS = ["508", "516", "524"] |
17 | 19 | # Formats which are stored in offset binary form |
18 | 20 | OFFSET_FMTS = ["80", "160"] |
19 | 21 | # All WFDB dat formats - https://www.physionet.org/physiotools/wag/signal-5.htm |
20 | | -DAT_FMTS = ALIGNED_FMTS + UNALIGNED_FMTS |
| 22 | +DAT_FMTS = ALIGNED_FMTS + UNALIGNED_FMTS + COMPRESSED_FMTS |
21 | 23 |
|
22 | 24 | # Bytes required to hold each sample (including wasted space) for each |
23 | 25 | # WFDB dat formats |
|
32 | 34 | "212": 1.5, |
33 | 35 | "310": 4 / 3.0, |
34 | 36 | "311": 4 / 3.0, |
| 37 | + "508": 0, |
| 38 | + "516": 0, |
| 39 | + "524": 0, |
35 | 40 | } |
36 | 41 |
|
37 | 42 | # The bit resolution of each WFDB dat format |
|
46 | 51 | "212": 12, |
47 | 52 | "310": 10, |
48 | 53 | "311": 10, |
| 54 | + "508": 8, |
| 55 | + "516": 16, |
| 56 | + "524": 24, |
49 | 57 | } |
50 | 58 |
|
51 | 59 | # Numpy dtypes used to load dat files of each format. |
|
66 | 74 | # formats. |
67 | 75 | SAMPLE_VALUE_RANGE = { |
68 | 76 | "80": (-(2**7), 2**7 - 1), |
| 77 | + "508": (-(2**7), 2**7 - 1), |
69 | 78 | "310": (-(2**9), 2**9 - 1), |
70 | 79 | "311": (-(2**9), 2**9 - 1), |
71 | 80 | "212": (-(2**11), 2**11 - 1), |
72 | 81 | "16": (-(2**15), 2**15 - 1), |
73 | 82 | "61": (-(2**15), 2**15 - 1), |
74 | 83 | "160": (-(2**15), 2**15 - 1), |
| 84 | + "516": (-(2**15), 2**15 - 1), |
75 | 85 | "24": (-(2**23), 2**23 - 1), |
| 86 | + "524": (-(2**23), 2**23 - 1), |
76 | 87 | "32": (-(2**31), 2**31 - 1), |
77 | 88 | "8": (-(2**31), 2**31 - 1), |
78 | 89 | } |
|
81 | 92 | # WFDB dat formats. |
82 | 93 | INVALID_SAMPLE_VALUE = { |
83 | 94 | "80": -(2**7), |
| 95 | + "508": -(2**7), |
84 | 96 | "310": -(2**9), |
85 | 97 | "311": -(2**9), |
86 | 98 | "212": -(2**11), |
87 | 99 | "16": -(2**15), |
88 | 100 | "61": -(2**15), |
89 | 101 | "160": -(2**15), |
| 102 | + "516": -(2**15), |
90 | 103 | "24": -(2**23), |
| 104 | + "524": -(2**23), |
91 | 105 | "32": -(2**31), |
92 | 106 | "8": None, |
93 | 107 | } |
@@ -1312,6 +1326,18 @@ def _rd_dat_signals( |
1312 | 1326 | # Read values from dat file. Append bytes/samples if needed. |
1313 | 1327 | if no_file: |
1314 | 1328 | data_to_read = sig_data |
| 1329 | + elif fmt in COMPRESSED_FMTS: |
| 1330 | + data_to_read = _rd_compressed_file( |
| 1331 | + file_name=file_name, |
| 1332 | + dir_name=dir_name, |
| 1333 | + pn_dir=pn_dir, |
| 1334 | + fmt=fmt, |
| 1335 | + sample_offset=byte_offset, |
| 1336 | + n_sig=n_sig, |
| 1337 | + samps_per_frame=samps_per_frame, |
| 1338 | + start_frame=sampfrom, |
| 1339 | + end_frame=sampto, |
| 1340 | + ) |
1315 | 1341 | else: |
1316 | 1342 | data_to_read = _rd_dat_file( |
1317 | 1343 | file_name, dir_name, pn_dir, fmt, start_byte, n_read_samples |
@@ -1747,6 +1773,162 @@ def _blocks_to_samples(sig_data, n_samp, fmt): |
1747 | 1773 | return sig |
1748 | 1774 |
|
1749 | 1775 |
|
| 1776 | +def _rd_compressed_file( |
| 1777 | + file_name, |
| 1778 | + dir_name, |
| 1779 | + pn_dir, |
| 1780 | + fmt, |
| 1781 | + sample_offset, |
| 1782 | + n_sig, |
| 1783 | + samps_per_frame, |
| 1784 | + start_frame, |
| 1785 | + end_frame, |
| 1786 | +): |
| 1787 | + """ |
| 1788 | + Read data from a compressed file into a 1D numpy array. |
| 1789 | +
|
| 1790 | + Parameters |
| 1791 | + ---------- |
| 1792 | + file_name : str |
| 1793 | + The name of the signal file. |
| 1794 | + dir_name : str |
| 1795 | + The full directory where the signal file is located, if local. |
| 1796 | + This argument is ignored if `pn_dir` is not None. |
| 1797 | + pn_dir : str or None |
| 1798 | + The PhysioNet database directory where the signal file is located. |
| 1799 | + fmt : str |
| 1800 | + The format code of the signal file. |
| 1801 | + sample_offset : int |
| 1802 | + The sample number in the signal file corresponding to sample 0 of |
| 1803 | + the WFDB record. |
| 1804 | + n_sig : int |
| 1805 | + The number of signals in the file. |
| 1806 | + samps_per_frame : list |
| 1807 | + The number of samples per frame for each signal in the file. |
| 1808 | + start_frame : int |
| 1809 | + The starting frame number to read. |
| 1810 | + end_frame : int |
| 1811 | + The ending frame number to read. |
| 1812 | +
|
| 1813 | + Returns |
| 1814 | + ------- |
| 1815 | + signal : ndarray |
| 1816 | + The data read from the signal file. This is a one-dimensional |
| 1817 | + array in the same order the samples would be stored in a binary |
| 1818 | + signal file; `signal[(i*n_sig+j)*samps_per_frame[0]+k]` is sample |
| 1819 | + number `i*samps_per_frame[0]+k` of signal `j`. |
| 1820 | +
|
| 1821 | + Notes |
| 1822 | + ----- |
| 1823 | + Converting the output array into "dat file order" here is inefficient, |
| 1824 | + but necessary to match the behavior of _rd_dat_file. It would be |
| 1825 | + better to reorganize _rd_dat_signals to make the reshaping unnecessary. |
| 1826 | +
|
| 1827 | + """ |
| 1828 | + import soundfile |
| 1829 | + |
| 1830 | + if any(spf != samps_per_frame[0] for spf in samps_per_frame): |
| 1831 | + raise ValueError( |
| 1832 | + "All channels in a FLAC signal file must have the same " |
| 1833 | + "sampling rate and samples per frame" |
| 1834 | + ) |
| 1835 | + |
| 1836 | + if pn_dir is None: |
| 1837 | + file_name = os.path.join(dir_name, file_name) |
| 1838 | + |
| 1839 | + with _coreio._open_file(pn_dir, file_name, "rb") as fp: |
| 1840 | + signature = fp.read(4) |
| 1841 | + if signature != b"fLaC": |
| 1842 | + raise ValueError(f"{fp.name} is not a FLAC file") |
| 1843 | + fp.seek(0) |
| 1844 | + |
| 1845 | + with soundfile.SoundFile(fp) as sf: |
| 1846 | + # Determine the actual resolution of the FLAC stream and the |
| 1847 | + # data type will use when reading it. Note that soundfile |
| 1848 | + # doesn't support int8. |
| 1849 | + if sf.subtype == "PCM_S8": |
| 1850 | + format_bits = 8 |
| 1851 | + read_dtype = "int16" |
| 1852 | + elif sf.subtype == "PCM_16": |
| 1853 | + format_bits = 16 |
| 1854 | + read_dtype = "int16" |
| 1855 | + elif sf.subtype == "PCM_24": |
| 1856 | + format_bits = 24 |
| 1857 | + read_dtype = "int32" |
| 1858 | + else: |
| 1859 | + raise ValueError(f"unknown subtype in {fp.name} ({sf.subtype})") |
| 1860 | + |
| 1861 | + max_bits = int(fmt) - 500 |
| 1862 | + if format_bits > max_bits: |
| 1863 | + raise ValueError( |
| 1864 | + f"wrong resolution in {fp.name} " |
| 1865 | + f"({format_bits}, expected <= {max_bits})" |
| 1866 | + ) |
| 1867 | + |
| 1868 | + if sf.channels != n_sig: |
| 1869 | + raise ValueError( |
| 1870 | + f"wrong number of channels in {fp.name} " |
| 1871 | + f"({sf.channels}, expected {n_sig})" |
| 1872 | + ) |
| 1873 | + |
| 1874 | + # Read the samples. |
| 1875 | + start_samp = start_frame * samps_per_frame[0] |
| 1876 | + end_samp = end_frame * samps_per_frame[0] |
| 1877 | + sf.seek(start_samp + sample_offset) |
| 1878 | + sig_data = sf.read(end_samp - start_samp, dtype=read_dtype) |
| 1879 | + |
| 1880 | + # If we read an 8-bit stream as int16 or a 24-bit stream as |
| 1881 | + # int32, soundfile shifts each sample left by 8 bits. We |
| 1882 | + # want to undo this shift (and, in the case of 8-bit data, |
| 1883 | + # convert to an int8 array.) |
| 1884 | + if format_bits == 8: |
| 1885 | + # np.right_shift(sig_data, 8, dtype='int8') doesn't work. |
| 1886 | + # This seems wrong, but the numpy documentation is unclear. |
| 1887 | + sig_data2 = np.empty(sig_data.shape, dtype="int8") |
| 1888 | + sig_data = np.right_shift(sig_data, 8, out=sig_data2) |
| 1889 | + elif format_bits == 24: |
| 1890 | + # Shift 32-bit array in-place. |
| 1891 | + np.right_shift(sig_data, 8, out=sig_data) |
| 1892 | + |
| 1893 | + # Suppose we have 3 channels and 2 samples per frame. The array |
| 1894 | + # returned by sf.read looks like this: |
| 1895 | + # |
| 1896 | + # channel 0 channel 1 channel 2 |
| 1897 | + # time 0 [0,0] [0,1] [0,2] |
| 1898 | + # time 1 [1,0] [1,1] [1,2] |
| 1899 | + # time 2 [2,0] [2,1] [2,2] |
| 1900 | + # time 3 [3,0] [3,1] [3,2] |
| 1901 | + # |
| 1902 | + # We reshape this first into the following: |
| 1903 | + # |
| 1904 | + # channel 0 channel 1 channel 2 |
| 1905 | + # time 0 [0,0,0] [0,0,1] [0,0,2] |
| 1906 | + # time 1 [0,1,0] [0,1,1] [0,1,2] |
| 1907 | + # time 2 [1,0,0] [1,0,1] [1,0,2] |
| 1908 | + # time 3 [1,1,0] [1,1,1] [1,1,2] |
| 1909 | + # |
| 1910 | + # Then we transpose axes 1 and 2: |
| 1911 | + # |
| 1912 | + # channel 0 channel 1 channel 2 |
| 1913 | + # time 0 [0,0,0] [0,1,0] [0,2,0] |
| 1914 | + # time 1 [0,0,1] [0,1,1] [0,2,1] |
| 1915 | + # time 2 [1,0,0] [1,1,0] [1,2,0] |
| 1916 | + # time 3 [1,0,1] [1,1,1] [1,2,1] |
| 1917 | + # |
| 1918 | + # Then when we reshape the array to 1D, the result is in dat file |
| 1919 | + # order: |
| 1920 | + # |
| 1921 | + # channel 0 channel 1 channel 2 |
| 1922 | + # time 0 [0] [2] [4] |
| 1923 | + # time 1 [1] [3] [5] |
| 1924 | + # time 2 [6] [8] [10] |
| 1925 | + # time 3 [7] [9] [11] |
| 1926 | + |
| 1927 | + sig_data = sig_data.reshape(-1, samps_per_frame[0], n_sig) |
| 1928 | + sig_data = sig_data.transpose(0, 2, 1) |
| 1929 | + return sig_data.reshape(-1) |
| 1930 | + |
| 1931 | + |
1750 | 1932 | def _skew_sig( |
1751 | 1933 | sig, skew, n_sig, read_len, fmt, nan_replace, samps_per_frame=None |
1752 | 1934 | ): |
|
0 commit comments