Swapping out pandas

bsipocz · bsipocz · commit 18dc42b473ac · 2025-03-31T20:42:55.000-07:00
diff --git a/tutorials/euclid_access/1_Euclid_intro_MER_images.md b/tutorials/euclid_access/1_Euclid_intro_MER_images.md
@@ -65,7 +65,6 @@ Each MER image is approximately 1.47 GB. Downloading can take some time.
 import re
 
 import numpy as np
-import pandas as pd
 
 import matplotlib.pyplot as plt
 from matplotlib.patches import Ellipse
@@ -80,9 +79,6 @@ from astropy import units as u
 
 from astroquery.ipac.irsa import Irsa
 import sep
-
-# Copy-on-write is more performant and avoids unexpected modifications of the original DataFrame.
-pd.options.mode.copy_on_write = True
 ```
 
 ## 1. Search for multiwavelength Euclid Q1 MER mosaics that cover the star HD 168151
@@ -123,20 +119,23 @@ science_images
 Note that 'access_estsize' is in units of kb
 
 ```{code-cell} ipython3
-filename = science_images[science_images['energy_bandpassname']=='VIS']['access_url'][0]
-filesize = science_images[science_images['energy_bandpassname']=='VIS']['access_estsize'][0]/1000000
-
+filename = science_images[science_images['energy_bandpassname'] == 'VIS']['access_url'][0]
+filesize = science_images[science_images['energy_bandpassname'] == 'VIS']['access_estsize'][0] / 1000000
 print(filename)
 
 print(f'Please note this image is {filesize} GB. With 230 Mbps internet download speed, it takes about 1 minute to download.')
 ```
 
+```{code-cell} ipython3
+science_images
+```
+
 ### Extract the tileID of this image from the filename
 
 ```{code-cell} ipython3
-tileID=re.search(r'TILE\s*(\d{9})', filename).group(1)
+tileID = science_images[science_images['energy_bandpassname'] == 'VIS']['obs_id'][0][:9]
 
-print('The MER tile ID for this object is :',tileID)
+print(f'The MER tile ID for this object is : {tileID}')
 ```
 
 Retrieve the MER image -- note this file is about 1.46 GB
@@ -146,7 +145,7 @@ fname = download_file(filename, cache=True)
 hdu_mer_irsa = fits.open(fname)
 print(hdu_mer_irsa.info())
 
-head_mer_irsa = hdu_mer_irsa[0].header
+header_mer_irsa = hdu_mer_irsa[0].header
 ```
 
 If you would like to save the MER mosaic to disk, uncomment the following cell.
@@ -160,21 +159,22 @@ Please also define a suitable download directory; by default it will be `data` a
 Have a look at the header information for this image.
 
 ```{code-cell} ipython3
-head_mer_irsa
+header_mer_irsa
 ```
 
 Lets extract just the primary image.
 
 ```{code-cell} ipython3
-im_mer_irsa=hdu_mer_irsa[0].data
+im_mer_irsa = hdu_mer_irsa[0].data
 
 print(im_mer_irsa.shape)
 ```
 
 Due to the large field of view of the MER mosaic, let's cut out a smaller section (2"x2")of the MER mosaic to inspect the image
 
 ```{code-cell} ipython3
-plt.imshow(im_mer_irsa[0:1200,0:1200], cmap='gray', origin='lower', norm=ImageNormalize(im_mer_irsa[0:1200,0:1200], interval=PercentileInterval(99.9), stretch=AsinhStretch()))
+plt.imshow(im_mer_irsa[0:1200,0:1200], cmap='gray', origin='lower',
+           norm=ImageNormalize(im_mer_irsa[0:1200,0:1200], interval=PercentileInterval(99.9), stretch=AsinhStretch()))
 colorbar = plt.colorbar()
 ```
 
@@ -203,21 +203,20 @@ urls
 Create an array with the instrument and filter name so we can add this to the plots.
 
 ```{code-cell} ipython3
-df_im_euclid.loc[:, "filters"] = df_im_euclid["instrument_name"] + "_" + df_im_euclid["energy_bandpassname"]
+science_images['filters'] = science_images['instrument_name'] + "_" + science_images['energy_bandpassname']
 
-## Note that VIS_VIS appears in the filters, so update that filter to just say VIS
-df_im_euclid.loc[df_im_euclid["filters"] == "VIS_VIS", "filters"] = "VIS"
+# VIS_VIS appears in the filters, so update that filter to just say VIS
+science_images['filters'][science_images['filters']== 'VIS_VIS'] = "VIS"
 
-filters = df_im_euclid['filters'].to_numpy()
-filters
+science_images['filters']
 ```
 
 ## The image above is very large, so let's cut out a smaller image to inspect these data.
 
 ```{code-cell} ipython3
 ######################## User defined section ############################
 ## How large do you want the image cutout to be?
-im_cutout= 1.0 * u.arcmin
+im_cutout = 1.0 * u.arcmin
 
 ## What is the center of the cutout?
 ## For now choosing a random location on the image
@@ -229,7 +228,7 @@ dec =  64.525
 # ra = 273.474451
 # dec = 64.397273
 
-coords_cutout = SkyCoord(ra, dec, unit=(u.deg, u.deg), frame='icrs')
+coords_cutout = SkyCoord(ra, dec, unit='deg', frame='icrs')
 
 ##########################################################################
 
@@ -296,13 +295,9 @@ plt.show()
 First we list all the filters so you can choose which cutout you want to extract sources on. We will choose VIS.
 
 ```{code-cell} ipython3
-filters
-```
-
-```{code-cell} ipython3
-filt_index = np.where(filters == 'VIS')[0][0]
+filt_index = np.where(science_images['filters'] == 'VIS')[0][0]
 
-img1=final_hdulist[filt_index].data
+img1 = final_hdulist[filt_index].data
 ```
 
 ### Extract some sources from the cutout using sep (python package based on source extractor)
diff --git a/tutorials/euclid_access/5_Euclid_intro_SPE_catalog.md b/tutorials/euclid_access/5_Euclid_intro_SPE_catalog.md
@@ -50,7 +50,7 @@ If you have questions about this notebook, please contact the [IRSA helpdesk](ht
 
 ```{code-cell} ipython3
 # Uncomment the next line to install dependencies if needed
-# !pip install matplotlib pandas astropy 'astroquery>=0.4.10'
+# !pip install matplotlib astropy 'astroquery>=0.4.10'
 ```
 
 ```{code-cell} ipython3
@@ -59,14 +59,13 @@ import urllib
 
 import matplotlib.pyplot as plt
 import numpy as np
-import pandas as pd
 
 from astropy.coordinates import SkyCoord
 from astropy.io import fits
-from astropy.table import Table
+from astropy.table import QTable
 from astropy import units as u
 from astropy.utils.data import download_file
-from astropy.visualization import ImageNormalize, PercentileInterval, AsinhStretch
+from astropy.visualization import ImageNormalize, PercentileInterval, AsinhStretch, quantity_support
 
 from astroquery.ipac.irsa import Irsa
 ```
@@ -80,41 +79,41 @@ search_radius = 10 * u.arcsec
 coord = SkyCoord.from_name('HD 168151')
 ```
 
-### Use IRSA to search for all Euclid data on this target
+```{tip}
+The IRSA SIA collections can be listed using the ``list_collections`` method; we can filter on the ones containing "euclid" in the collection name:
 
-This searches specifically in the euclid_DpdMerBksMosaic "collection" which is the MER images and catalogs.
+    Irsa.list_collections(filter='euclid')
+```
 
-```{code-cell} ipython3
-im_table = Irsa.query_sia(pos=(coord, search_radius), collection='euclid_DpdMerBksMosaic')
++++
 
-## Convert the table to pandas dataframe
-df_im_irsa=im_table.to_pandas()
-```
+### Use IRSA to search for all Euclid data on this target
+
+This searches specifically in the ``euclid_DpdMerBksMosaic`` collection, which contains the MER images and catalogs.
 
 ```{code-cell} ipython3
-## Change the settings so we can see all the columns in the dataframe and the full column width
-## (to see the full long URL)
-pd.set_option('display.max_columns', None)
-pd.set_option('display.max_colwidth', None)
+image_table = Irsa.query_sia(pos=(coord, search_radius), collection='euclid_DpdMerBksMosaic')
 ```
 
-#### This dataframe contains other non-Euclid datasets that have been "Euclidized", meaning they have been put on the same pixel scale as the Euclid data. For this example we just want to look at the Euclid data, so select Euclid for the facility name, and choose science as the data product subtype.
+This table lists all MER mosaic images available at this search position. These mosaics include the Euclid VIS, Y, J, H images, as well as images from ground-based telescopes which have been put on the same pixel scale. For more information, see the [Euclid documentation at IPAC](https://euclid.caltech.edu/page/euclid-faq-tech/).
 
-```{code-cell} ipython3
-df_im_euclid=df_im_irsa[ (df_im_irsa['dataproduct_subtype']=='science') &  (df_im_irsa['facility_name']=='Euclid')]
+Note that various image types are returned as well; we select only the `science` images from these:
 
-df_im_euclid.head()
+```{code-cell} ipython3
+science_images = image_table[image_table['dataproduct_subtype'] == 'science']
+science_images
 ```
 
-## Choose the VIS image and pull the filename:
+### Choose the VIS image and pull the Tile ID
 
-```{code-cell} ipython3
-filename=df_im_euclid[df_im_euclid['energy_bandpassname']=='VIS']['access_url'].to_list()[0]
++++
 
-# ## Extract the tileID from the filename
-tileID=re.search(r'TILE\s*(\d{9})', filename).group(1)
+Extract the tile ID from the ``obs_id`` column. The values in this column are a combination of the 9-digit tile ID and the abbreviation of the instrument.
 
-print('The MER tile ID for this object is :',tileID)
+```{code-cell} ipython3
+tileID = science_images[science_images['energy_bandpassname'] == 'VIS']['obs_id'][0][:9]
+
+print(f'The MER tile ID for this object is : {tileID}')
 ```
 
 ## 2. Download SPE catalog from IRSA directly to this notebook
@@ -137,10 +136,14 @@ table_lines = 'euclid_q1_spe_lines_line_features'
 - List the column names
 
 ```{code-cell} ipython3
-columns_info = Irsa.list_columns(catalog=table_lines)
+columns_info = Irsa.list_columns(catalog=table_mer)
 print(len(columns_info))
 ```
 
+```{code-cell} ipython3
+Irsa.list_columns(catalog=table_1dspectra, full=True)
+```
+
 ```{code-cell} ipython3
 # Full list of columns and their description
 columns_info
@@ -159,89 +162,87 @@ We specify the following conditions on our search:
 Finally we sort the data by descending spe_line_snr_gf to have the largest SNR H-alpha lines detected at the top.
 
 ```{code-cell} ipython3
-adql = f"SELECT DISTINCT mer.object_id,mer.ra, mer.dec, mer.tileid, mer.flux_y_templfit, \
-lines.spe_line_snr_gf,lines.spe_line_snr_di, lines.spe_line_name, lines.spe_line_central_wl_gf,\
-lines.spe_line_ew_gf, galaxy.spe_z_err, galaxy.spe_z,galaxy.spe_z_prob, lines.spe_line_flux_gf, lines.spe_line_flux_err_gf \
-FROM {table_mer} AS mer \
-JOIN {table_lines} AS lines \
-ON mer.object_id = lines.object_id \
-JOIN {table_galaxy_candidates} AS galaxy \
-ON lines.object_id = galaxy.object_id AND lines.spe_rank = galaxy.spe_rank \
-WHERE lines.spe_line_snr_gf >5 \
-AND lines.spe_line_name = 'Halpha' \
-AND mer.tileid = {tileID} \
-AND galaxy.spe_z_prob > 0.99 \
-AND galaxy.spe_z BETWEEN 1.4 AND 1.6 \
-AND lines.spe_line_flux_gf > 2E-16 \
-ORDER BY lines.spe_line_snr_gf DESC \
-"
+adql_query = ("SELECT DISTINCT mer.object_id,mer.ra, mer.dec, mer.tileid, mer.flux_y_templfit, "
+    "lines.spe_line_snr_gf,lines.spe_line_snr_di, lines.spe_line_name, lines.spe_line_central_wl_gf, "
+    "lines.spe_line_ew_gf, galaxy.spe_z_err, galaxy.spe_z,galaxy.spe_z_prob, "
+    "lines.spe_line_flux_gf, lines.spe_line_flux_err_gf "
+    f"FROM {table_mer} AS mer "
+    f"JOIN {table_lines} AS lines "
+    "ON mer.object_id = lines.object_id "
+    f"JOIN {table_galaxy_candidates} AS galaxy "
+    "ON lines.object_id = galaxy.object_id AND lines.spe_rank = galaxy.spe_rank "
+    "WHERE lines.spe_line_snr_gf >5 "
+    "AND lines.spe_line_name = 'Halpha' "
+    f"AND mer.tileid = {tileID} "
+    "AND galaxy.spe_z_prob > 0.99 "
+    "AND galaxy.spe_z BETWEEN 1.4 AND 1.6 "
+    "AND lines.spe_line_flux_gf > 2E-16 "
+    "ORDER BY lines.spe_line_snr_gf DESC ")
 
 # Use TAP with this ADQL string
-result = Irsa.query_tap(adql)
-
-# Convert table to pandas dataframe and drop duplicates
-result_table = result.to_qtable()
+result_table = Irsa.query_tap(adql_query).to_qtable()
 
 result_table['spe_line_flux_gf'].info.format = ".8e"  # Scientific notation with 8 decimal places
 result_table['spe_line_flux_err_gf'].info.format = ".8e"
-result_table['object_id'] = result['object_id'].astype('int64')
+result_table['object_id'] = result_table['object_id'].astype('int64')
 ```
 
 ### Choose an object of interest, lets look at an object with a strong Halpha line detected with high SNR.
 
 ```{code-cell} ipython3
 obj_id = 2737659721646729968
 
-obj_tab = result_table[(result_table['object_id'] == obj_id)]
+obj_row = result_table[(result_table['object_id'] == obj_id)]
 
-obj_tab
+obj_row
 ```
 
 ### Pull the spectrum of this object
 
 ```{code-cell} ipython3
 adql_object = f"SELECT *  FROM {table_1dspectra}  WHERE objectid = {obj_id}"
 
-result2 = Irsa.query_tap(adql_object)
-df2 = result2.to_table().to_pandas()
-df2
+result_table2 = Irsa.query_tap(adql_object).to_qtable()
 ```
 
 ### The following steps to read in the spectrum follows the 3_Euclid_intro_1D_spectra notebook.
 
 This involves reading in the spectrum without readin in the full FITS file, just pulling the extension we want.
 
 ```{code-cell} ipython3
-file_uri = urllib.parse.urljoin(Irsa.tap_url, result2['uri'][0])
+file_uri = urllib.parse.urljoin(Irsa.tap_url, result_table2['uri'][0])
 file_uri
 ```
 
 ```{code-cell} ipython3
 with fits.open(file_uri) as hdul:
-    hdu = hdul[df2['hdu'].iloc[0]]
-    dat = Table.read(hdu, format='fits', hdu=1)
-    df_obj_irsa = dat.to_pandas()
+    spectrum = QTable.read(hdul[result_table2['hdu'][0]], format='fits')
+    spec_header = hdul[result_table2['hdu'][0]].header
 ```
 
 ### Now the data are read in, plot the spectrum with the H-alpha line labeled
 
-Divide by 10000 to convert from Angstrom to micron
+```{tip}
+As we use astropy.visualization's ``quantity_support``, matplotlib automatically picks up the axis units from the quantities we plot.
+```
+
+```{code-cell} ipython3
+quantity_support()
+```
 
 ```{code-cell} ipython3
-wavelengths = obj_tab['spe_line_central_wl_gf']/10000.
-line_names = obj_tab['spe_line_name']
-snr_gf = obj_tab['spe_line_snr_gf']
+# Note that the units are missing from the lines table, so we manually add Angstrom
+line_wavelengths = obj_row['spe_line_central_wl_gf'] * u.angstrom
+line_names = obj_row['spe_line_name']
+snr_gf = obj_row['spe_line_snr_gf']
 
-plt.plot(df_obj_irsa['WAVELENGTH']/10000., df_obj_irsa['SIGNAL'])
+plt.plot(spectrum['WAVELENGTH'].to(u.micron), spectrum['SIGNAL'])
 
-for wl, name, snr in zip(np.atleast_1d(wavelengths), np.atleast_1d(line_names), np.atleast_1d(snr_gf)):
+for wl, name, snr in zip(np.atleast_1d(line_wavelengths), np.atleast_1d(line_names), np.atleast_1d(snr_gf)):
     plt.axvline(wl, color='b', linestyle='--', alpha=0.3)
-    plt.text(wl+0.02, .2, name+' SNR='+str(round(snr)), rotation=90, ha='center', va='bottom', fontsize=10)
+    plt.text(wl, .2, name+' SNR='+str(round(snr)), rotation=90, ha='center', va='bottom', fontsize=10)
 
-plt.xlabel('Wavelength (microns)')
-plt.ylabel('Flux (erg / (s cm2))')
-plt.xlim(1.25, 1.85)
-plt.title('Object ID is '+str(obj_id))
+plt.title(f'Object ID {obj_id}')
 ```
 
 ## About this Notebook