Source code for pytups.core.Dataset

__all__ = ['Dataset']

import requests
import pandas as pd
import warnings
from .NOAADataset import NOAADataset
from ..utils.helpers import assert_list
from ..utils.Parser.StandardParser import DataFetcher, StandardParser

class Dataset:
    """
    A wrapper class for interacting with the NOAA Studies API.

    Manages the retrieval, parsing, and aggregation of NOAA study data, and
    provides methods to access summaries, publications, sites, and external
    data files.

    Attributes
    ----------
    BASE_URL : str
        The NOAA API endpoint URL.
    studies : dict
        A mapping from NOAADatasetId to NOAADataset instances.
    data_table_index : dict
        A mapping from dataTableID to associated study, site, and paleo data.

    Methods
    -------
    __init__()
        Initializes the Dataset.
    search_studies(...)
        Searches for studies using provided parameters and parses the response.
    _fetch_api(params)
        Internal method to make an HTTP GET request to the NOAA API.
    _parse_response(data)
        Internal method to parse the JSON response and populate studies.
    get_summary_dataframe()
        Returns a DataFrame summarizing all loaded studies.
    get_publications_dataframe()
        Returns a DataFrame of publications aggregated from studies.
    get_sites_dataframe()
        Returns a DataFrame of sites aggregated from studies.
    get_data(dataTableIDs, file_urls)
        Fetches and returns external data based on data table IDs or file URLs.
    """

    BASE_URL = "https://www.ncei.noaa.gov/access/paleo-search/study/search.json"
    _PROPRIETARY_TYPES = {'crn', 'rwl', 'fhx', 'lpd'}
    def __init__(self):
        """
        Initialize the Dataset instance.

        Attributes are set to their default empty values.
        """
        self.studies = {}                # NOAADatasetId -> NOAADataset instance
        self.data_table_index = {}       # dataTableID -> dict with study, site, paleo_data
        self.file_url_to_datatable = {}  # file_url -> dataTableID
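    # Illustrative usage sketch (not part of the class): a freshly constructed
    # Dataset starts with empty indexes; they are only populated by
    # search_studies(). The import path is assumed from the module name shown
    # at the top of this page.
    #
    #     from pytups.core.Dataset import Dataset
    #     ds = Dataset()
    #     assert ds.studies == {} and ds.data_table_index == {}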
    def search_studies(self, xml_id=None, noaa_id=None, data_publisher="NOAA",
                       data_type_id=None, keywords=None, investigators=None,
                       max_lat=None, min_lat=None, max_lon=None, min_lon=None,
                       location=None, publication=None, search_text=None,
                       earliest_year=None, latest_year=None, cv_whats=None,
                       recent=False):
        """
        Search for NOAA studies using the provided parameters.

        At least one parameter must be specified for a search to be initiated.

        Parameters
        ----------
        xml_id : str, optional
            XML identifier for a study.
        noaa_id : str, optional
            NOAA study identifier.
        data_publisher : str, optional
            Publisher of the data; the default is "NOAA".
        data_type_id : str, optional
            Data type identifier.
        keywords : str, optional
            Keywords for the search.
        investigators : str, optional
            Investigator names.
        max_lat : float, optional
            Maximum latitude.
        min_lat : float, optional
            Minimum latitude.
        max_lon : float, optional
            Maximum longitude.
        min_lon : float, optional
            Minimum longitude.
        location : str, optional
            Location description.
        publication : str, optional
            Publication details.
        search_text : str, optional
            Additional text to search within the study.
        earliest_year : int, optional
            Earliest year of the study.
        latest_year : int, optional
            Latest year of the study.
        cv_whats : str, optional
            Controlled vocabulary term.
        recent : bool, optional
            Flag to filter recent studies.

        Returns
        -------
        None
            The method populates internal attributes with the retrieved data.

        Notes
        -----
        At least one parameter is required. Parameter validation is to be
        implemented.
        """
        if noaa_id:
            params = {'NOAADatasetId': noaa_id}
        elif xml_id:
            params = {'xmlId': xml_id}
        else:
            # Note: `publication` is accepted but not yet forwarded as a query parameter.
            params = {
                'dataPublisher': data_publisher,
                'dataTypeId': data_type_id,
                'keywords': keywords,
                'investigators': investigators,
                'minLat': min_lat,
                'maxLat': max_lat,
                'minLon': min_lon,
                'maxLon': max_lon,
                'locations': location,
                'searchText': search_text,
                'cvWhats': cv_whats,
                'earliestYear': earliest_year,
                'latestYear': latest_year,
                'recent': recent,
            }

        # Drop parameters that were not supplied.
        params = {k: v for k, v in params.items() if v is not None}

        response_json = self._fetch_api(params)
        self._parse_response(response_json)
        self.get_summary_dataframe()
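    # Usage sketch (assumes network access to the NOAA endpoint; the parameter
    # values shown are hypothetical). Passing noaa_id or xml_id short-circuits
    # all other filters; otherwise the remaining keyword filters are combined:
    #
    #     ds = Dataset()
    #     ds.search_studies(noaa_id="12345")                    # single-study lookup
    #     ds.search_studies(keywords="tree ring",
    #                       min_lat=30, max_lat=50,
    #                       min_lon=-120, max_lon=-100)         # filtered search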
    def _fetch_api(self, params):
        """
        Fetch data from the NOAA API using the given parameters.

        Parameters
        ----------
        params : dict
            A dictionary of query parameters.

        Returns
        -------
        dict
            The JSON response from the NOAA API.

        Raises
        ------
        Exception
            If the API response status is not 200.
        """
        response = requests.get(self.BASE_URL, params=params)
        if response.status_code == 200:
            return response.json()
        else:
            raise Exception(f"Error fetching studies: {response.status_code}")
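    # A minimal standalone sketch of the same request, for reference only
    # (the query value is hypothetical; raise_for_status() is roughly
    # equivalent to the explicit status-code check above):
    #
    #     import requests
    #     r = requests.get(Dataset.BASE_URL, params={'keywords': 'coral'})
    #     r.raise_for_status()
    #     data = r.json()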
    def _parse_response(self, data):
        """
        Parse the JSON response and populate the internal studies and data_table_index.

        Parameters
        ----------
        data : dict
            The JSON data returned from the NOAA API.

        Returns
        -------
        None
        """
        self.studies.clear()
        self.data_table_index.clear()
        self.file_url_to_datatable.clear()
        # self.sites.clear()

        for study_data in data.get('study', []):
            study_obj = NOAADataset(study_data)
            self.studies[study_obj.study_id] = study_obj
            # Process each site in the study.
            for site in study_obj.sites:
                # self.sites[site.site_id] = site
                # Build an index for each PaleoData object and map its file URL
                # to the dataTableID.
                for paleo in site.paleo_data:
                    self.data_table_index[paleo.datatable_id] = {
                        'study_id': study_obj.study_id,
                        'site_id': site.site_id,
                        'paleo_data': paleo,
                    }
                    if paleo.file_url:
                        self.file_url_to_datatable[paleo.file_url] = paleo.datatable_id
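    # After _parse_response runs, each data_table_index entry has the shape
    # built above (the ID values shown are hypothetical placeholders):
    #
    #     ds.data_table_index["9876"] == {
    #         'study_id': "12345",
    #         'site_id': "55555",
    #         'paleo_data': <PaleoData instance with .datatable_id and .file_url>,
    #     }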
    def get_summary_dataframe(self):
        """
        Get a DataFrame summarizing all loaded studies.

        Returns
        -------
        pandas.DataFrame
            A DataFrame with a summary of study metadata and components.
        """
        data = [study.to_dict() for study in self.studies.values()]
        return pd.DataFrame(data)
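    # Usage sketch: the summary DataFrame has one row per loaded study; its
    # columns are whatever NOAADataset.to_dict() returns, so they are not
    # enumerated here.
    #
    #     summary = ds.get_summary_dataframe()
    #     summary.shape   # (number of studies, number of to_dict() keys)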
    def get_publications_dataframe(self):
        """
        Get a DataFrame of all publications aggregated from the studies.

        Returns
        -------
        pandas.DataFrame
            A DataFrame containing publication details with study context.
        """
        publications_data = []
        for study in self.studies.values():
            for pub in study.publications:
                pub_dict = pub.to_dict()
                pub_dict['StudyID'] = study.study_id
                pub_dict['StudyName'] = study.metadata.get("studyName")
                publications_data.append(pub_dict)
        return pd.DataFrame(publications_data)
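    # Usage sketch: each publication row carries its parent study context via
    # the 'StudyID' and 'StudyName' columns added above, so publications can be
    # grouped back by study with plain pandas:
    #
    #     pubs = ds.get_publications_dataframe()
    #     pubs_per_study = pubs.groupby('StudyID').size()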
    def get_sites_dataframe(self):
        """
        Get a DataFrame of all sites aggregated from the studies, including paleo data.

        Returns
        -------
        pandas.DataFrame
            A DataFrame containing site details with study context and paleo data.
        """
        records = []
        for study in self.studies.values():
            study_id = study.study_id
            study_name = study.metadata.get("studyName")
            for site in study.sites:
                site_dict = site.to_dict()
                # Remove PaleoData from site_dict so it doesn't duplicate the paleo records.
                paleo_data = site_dict.pop('PaleoData', None)
                if paleo_data and isinstance(paleo_data, list) and len(paleo_data) > 0:
                    # For each paleo record in the list, create a merged record.
                    for paleo_record in paleo_data:
                        # Merge site data and paleo record. If paleo_record contains an ID,
                        # it could be extracted and set as NOAADataTableId.
                        record = {**site_dict, **paleo_record}
                        record.update({
                            'StudyID': study_id,
                            'StudyName': study_name,
                        })
                        records.append(record)
                else:
                    # If no paleo data is present, record the site as is.
                    site_dict.update({
                        'StudyID': study_id,
                        'StudyName': study_name,
                    })
                    records.append(site_dict)
        return pd.DataFrame(records)
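    # The merge above is a plain dict union: paleo keys overwrite site keys on
    # collision, and 'StudyID'/'StudyName' are added last. A minimal standalone
    # illustration with made-up keys and values:
    #
    #     site_dict = {'SiteID': 's1', 'SiteName': 'Lake A'}
    #     paleo_record = {'NOAADataTableId': 't1'}
    #     record = {**site_dict, **paleo_record}
    #     record.update({'StudyID': 'study1', 'StudyName': 'Example Study'})
    #     # record -> {'SiteID': 's1', 'SiteName': 'Lake A', 'NOAADataTableId': 't1',
    #     #            'StudyID': 'study1', 'StudyName': 'Example Study'}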
    def get_data_deprecated(self, dataTableIDs=None, file_urls=None):
        """
        Fetch external data for given dataTableIDs or file URLs and attach study/site metadata.

        Parameters
        ----------
        dataTableIDs : list or str, optional
            One or more NOAA data table IDs.
        file_urls : list or str, optional
            One or more file URLs.

        Returns
        -------
        list of pandas.DataFrame
            A list of DataFrames, each corresponding to fetched data.
        """
        if dataTableIDs:
            dataTableIDs = assert_list(dataTableIDs)
            dfs = []
            for dt_id in dataTableIDs:
                mapping = self.data_table_index.get(dt_id)
                if not mapping:
                    print(f"Data Table ID {dt_id} not found or no associated file URL.")
                    continue
                file_url = mapping['paleo_data'].file_url
                if not file_url:
                    print(f"No file URL for Data Table ID {dt_id}.")
                    continue
                fetched_data = DataFetcher.fetch_data(file_url)
                if isinstance(fetched_data, list):
                    for df in fetched_data:
                        df.attrs['NOAADatasetId'] = mapping['study_id']
                        df.attrs['SiteID'] = mapping['site_id']
                        study_obj = self.studies.get(mapping['study_id'], {})
                        df.attrs['StudyName'] = (study_obj.metadata.get("studyName")
                                                 if hasattr(study_obj, 'metadata') else None)
                        # Collect publication DOIs, if any.
                        publications = study_obj.publications if hasattr(study_obj, 'publications') else []
                        df.attrs['PublicationDOI'] = []
                        for pub in publications:
                            if hasattr(pub, "doi"):
                                doi = pub.doi if pub.doi else None
                                df.attrs['PublicationDOI'].append(doi)
                        dfs.append(df)
                else:
                    fetched_data.attrs['NOAADatasetId'] = mapping['study_id']
                    fetched_data.attrs['SiteID'] = mapping['site_id']
                    study_obj = self.studies.get(mapping['study_id'], {})
                    fetched_data.attrs['StudyName'] = (study_obj.metadata.get("studyName")
                                                       if hasattr(study_obj, 'metadata') else None)
                    dfs.append(fetched_data)
            return dfs

        if file_urls:
            file_urls = assert_list(file_urls)
            dfs = [DataFetcher.fetch_data(url) for url in file_urls]
            return dfs

        print("No dataTableID or file URL provided.")
        return pd.DataFrame()
    def _process_file(self, file_url, mapping=None):
        """
        Process a single file URL: validate the file type, parse the file, and attach metadata.

        Extended metadata now includes site details if available.

        Parameters
        ----------
        file_url : str
            The URL of the file to process.
        mapping : dict, optional
            The mapping information containing study and site metadata.

        Returns
        -------
        list of pandas.DataFrame
            A list of DataFrames parsed from the file.

        Raises
        ------
        ValueError
            For proprietary or invalid file types, or a missing file URL.
        Exception
            For any parsing errors encountered by StandardParser.
        """
        if not file_url:
            raise ValueError("Faulty input: file URL is missing.")

        file_type = file_url.split('.')[-1].lower()
        if file_type in self._PROPRIETARY_TYPES:
            raise ValueError(f"File type '{file_type}' requires proprietary software for processing. "
                             "Please use the appropriate software.")
        if file_type != 'txt':
            raise ValueError(f"Invalid file type '{file_type}'. Only .txt files are supported.")

        # Any parsing error raised by StandardParser propagates to the caller.
        parsed_data = StandardParser(file_url).parse()

        def attach_metadata(df, mapping):
            # Attach study metadata.
            df.attrs['NOAADatasetId'] = mapping.get('study_id')
            study_obj = self.studies.get(mapping.get('study_id'), {})
            df.attrs['StudyName'] = (study_obj.metadata.get("studyName")
                                     if hasattr(study_obj, 'metadata') else None)
            # Attach site metadata if available.
            # site_obj = self.sites.get(mapping.get('site_id'))
            # if site_obj:
            #     df.attrs['SiteID'] = site_obj.site_id
            #     df.attrs['SiteName'] = site_obj.site_name
            #     df.attrs['LocationName'] = site_obj.location_name
            #     df.attrs['Latitude'] = site_obj.lat
            #     df.attrs['Longitude'] = site_obj.lon
            #     df.attrs['MinElevation'] = site_obj.min_elevation
            #     df.attrs['MaxElevation'] = site_obj.max_elevation
            # publications = study_obj.publications if hasattr(study_obj, 'publications') else None
            # for pub in publications:
            #     if hasattr(pub, "doi"):
            #         doi = pub.doi if pub.doi else None
            #         df.attrs['PublicationDOI'].append(doi)
            return df

        results = []
        if isinstance(parsed_data, list):
            for df in parsed_data:
                if mapping:
                    df = attach_metadata(df, mapping)
                results.append(df)
        else:
            if mapping:
                parsed_data = attach_metadata(parsed_data, mapping)
            results.append(parsed_data)
        return results
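    # The file-type gate above keys off the URL's extension only. For example
    # (URLs are hypothetical):
    #
    #     "https://example.org/data/series.rwl".split('.')[-1].lower()  # 'rwl' -> ValueError (proprietary)
    #     "https://example.org/data/series.txt".split('.')[-1].lower()  # 'txt' -> parsed by StandardParser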
    def get_data(self, dataTableIDs=None, file_urls=None):
        """
        Fetch external data for given dataTableIDs or file URLs, perform validations,
        and attach study and site metadata.

        Parameters
        ----------
        dataTableIDs : list or str, optional
            One or more NOAA data table IDs.
        file_urls : list or str, optional
            One or more file URLs.

        Returns
        -------
        list of pandas.DataFrame
            A list of DataFrames corresponding to the fetched data.

        Raises
        ------
        ValueError
            For a missing parent study mapping, a missing file URL, or
            proprietary/unsupported file types.
        Exception
            Propagates any exceptions raised by the parser.
        """
        dfs = []

        # Process based on dataTableIDs.
        if dataTableIDs:
            dataTableIDs = assert_list(dataTableIDs)
            for dt_id in dataTableIDs:
                mapping = self.data_table_index.get(dt_id)
                if not mapping:
                    raise ValueError(f"No parent study mapping found for Data Table ID '{dt_id}'. "
                                     "Please perform a search using this DataTableID or provide a specific file URL.")
                file_url = mapping['paleo_data'].file_url
                if not file_url:
                    raise ValueError(f"File URL for Data Table ID '{dt_id}' is missing. Cannot fetch data.")
                dfs.extend(self._process_file(file_url, mapping))
            return dfs

        # Process based on file_urls provided directly.
        if file_urls:
            file_urls = assert_list(file_urls)
            for url in file_urls:
                mapping = self.file_url_to_datatable.get(url)
                if not mapping:
                    warnings.warn(
                        f"File URL '{url}' is not linked to any parent study; additional metadata will not be attached.",
                        UserWarning
                    )
                    dfs.extend(self._process_file(url))
                else:
                    mapping_details = self.data_table_index.get(mapping)
                    if not mapping_details:
                        warnings.warn(
                            f"Mapping details for file URL '{url}' (Data Table ID '{mapping}') not found; "
                            "additional metadata will not be attached.",
                            UserWarning
                        )
                        dfs.extend(self._process_file(url))
                    else:
                        dfs.extend(self._process_file(url, mapping_details))
            return dfs

        raise ValueError("No dataTableID or file URL provided. Cannot fetch data.")
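# End-to-end usage sketch (assumes network access; the keyword, DataTableID,
# and file URL below are hypothetical placeholders, not real NOAA identifiers):
#
#     ds = Dataset()
#     ds.search_studies(keywords="coral")
#     sites = ds.get_sites_dataframe()
#     # Fetch by a DataTableID found during the search, or by a direct .txt URL:
#     frames = ds.get_data(dataTableIDs="9876")
#     frames = ds.get_data(file_urls="https://www.ncei.noaa.gov/pub/data/paleo/example.txt")
#     frames[0].attrs.get('StudyName')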