diff --git a/docs/learning/notebooks/some-xarray-pandas-presentation_Sara.ipynb b/docs/learning/notebooks/some-xarray-pandas-presentation_Sara.ipynb index e68f402..05db541 100644 --- a/docs/learning/notebooks/some-xarray-pandas-presentation_Sara.ipynb +++ b/docs/learning/notebooks/some-xarray-pandas-presentation_Sara.ipynb @@ -10,17 +10,11 @@ "\n", "- We have massively different levels here\n", "- Try to make some aims for technical skills you can learn!\n", - "- If you are beginning with python --> learn the basics\n", - "- If you are good at basic python --> learn new packages\n", - "- If you know all the packages --> improve your skills with producing your own software etc. \n", - "- If you don't know git and github --> get better at this!\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "print('Hey world')" + "- If you are beginning with python → learn the basics\n", + "- If you are good at basic python → learn new packages and tricks \n", + "- If you know all the packages → improve your skills with producing your own software, organising your code etc. \n", + "- If you don't know git and github → get better at this!\n", + "- **Learn from each other!**" ] }, { @@ -58,8 +52,8 @@ }, "source": [ "## What are pandas and xarray?\n", - "- Pandas --> like a spreadsheet 2D data with columns and rows\n", - "- xarray --> like pandas, but in N dimensions\n", + "- Pandas → like a spreadsheet 2D data with columns and rows\n", + "- xarray → like pandas, but in N dimensions\n", "- **Use the functionality these packages gives you! Will help you avoid mistakes. 
Try to get as good as possible :)**\n" ] }, @@ -74,7 +68,6 @@ }, "cell_type": "markdown", "metadata": { - "jp-MarkdownHeadingCollapsed": true, "tags": [] }, "source": [ @@ -369,8 +362,12 @@ } ], "source": [ - "cat = col.search(source_id=['CESM2'], experiment_id=['historical'], table_id=['Amon','fx','AERmon'], \n", - " variable_id=['tas','hurs', 'areacella','mmrso4' ], member_id=['r1i1p1f1'])\n", + "cat = col.search(source_id = ['CESM2'], \n", + " experiment_id=['historical'], \n", + " table_id=['Amon','fx','AERmon'], \n", + " variable_id=['tas','hurs', 'areacella','mmrso4' ], \n", + " member_id=['r1i1p1f1'],\n", + " )\n", "cat.df\n" ] }, @@ -382,7 +379,7 @@ { "data": { "text/plain": [ - "AggregationControl(variable_column_name='variable_id', groupby_attrs=['activity_id', 'experiment_id', 'source_id', 'table_id', 'grid_label'], aggregations=[Aggregation(type=, attribute_name='variable_id', options={}), Aggregation(type=, attribute_name='time_range', options={'dim': 'time', 'coords': 'minimal', 'compat': 'override'}), Aggregation(type=, attribute_name='member_id', options={'coords': 'minimal', 'compat': 'override'})])" + "['activity_id', 'experiment_id', 'source_id', 'table_id', 'grid_label']" ] }, "execution_count": 4, @@ -392,7 +389,7 @@ ], "source": [ "cat.esmcat.aggregation_control.groupby_attrs = ['activity_id','experiment_id', 'source_id','table_id','grid_label']\n", - "cat.esmcat.aggregation_control#['groupby_attrs']" + "cat.esmcat.aggregation_control.groupby_attrs" ] }, { @@ -471,9 +468,9 @@ " var = coder.decode(var, name=name)\n", "/opt/conda/envs/pangeo-notebook/lib/python3.11/site-packages/xarray/conventions.py:286: SerializationWarning: variable 'hurs' has multiple fill values {1e+20, 1e+20} defined, decoding all values to NaN.\n", " var = coder.decode(var, name=name)\n", - "/opt/conda/envs/pangeo-notebook/lib/python3.11/site-packages/xarray/conventions.py:286: SerializationWarning: variable 'tas' has multiple fill values {1e+20, 1e+20} defined, decoding 
all values to NaN.\n", - " var = coder.decode(var, name=name)\n", "/opt/conda/envs/pangeo-notebook/lib/python3.11/site-packages/xarray/conventions.py:286: SerializationWarning: variable 'mmrso4' has multiple fill values {1e+20, 1e+20} defined, decoding all values to NaN.\n", + " var = coder.decode(var, name=name)\n", + "/opt/conda/envs/pangeo-notebook/lib/python3.11/site-packages/xarray/conventions.py:286: SerializationWarning: variable 'tas' has multiple fill values {1e+20, 1e+20} defined, decoding all values to NaN.\n", " var = coder.decode(var, name=name)\n" ] } @@ -506,6 +503,53 @@ "list(dset_dict.keys())" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "Since I have already checked that these datasets are on the same grid, we can merge them:\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_14081/179335234.py:6: DeprecationWarning: dropping variables using `drop` is deprecated; use drop_vars.\n", + " ds= ds.drop(v)\n" + ] + } + ], + "source": [ + "ds_list =[]\n", + "for k in dset_dict.keys():\n", + " ds = dset_dict[k]\n", + " for v in ['lon_bnds', 'lat_bnds', 'time_bnds']:\n", + " if v in ds:\n", + " ds= ds.drop(v)\n", + " ds_list.append(ds) \n", + "ds = xr.merge(ds_list,compat='override')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "## 1.1 Reading in the data from file:" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -518,16 +562,86 @@ ] }, { - "cell_type": "code", - "execution_count": 7, + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "```python\n", + "path='filename.nc'\n", + "ds = xr.open_dataset(path)\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "##### Opening multiple files:" + ] + }, + { + "cell_type": "markdown", + 
"metadata": { + "tags": [] + }, + "source": [ + "```python\n", + "\n", + "list_of_files = [\n", + " 'file1.nc',\n", + " 'file2.nc'\n", + "]\n", + "xr.open_mfdataset(list_of_files, concat_dim='time',combine='by_coords')\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "## 2. Check how your dataset looks" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "\n", + "NetCDF + xarray = <3\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "NetCDF (Network Common Data Form) is a machine-independent data format (and software) that supports the creation, access, and sharing of array-oriented scientific data. It was originally developed by UCAR (University Corporation for Atmospheric Research), and it's widely used in the atmospheric and oceanographic sciences, as well as in other fields such as Earth sciences, geophysics, and climatology.\n", + "\n", + "**What is really great is that it keeps a lot of metadata (see below)**\n", + "\n", + "Xarray is a Python library designed to work with multi-dimensional arrays and datasets, particularly those used in earth sciences, climate science, and atmospheric science. It builds upon and extends the functionality of **NumPy, Pandas, and NetCDF**, providing a high-level interface for working with **labeled, multi-dimensional data**." + ] + }, + { + "cell_type": "markdown", "metadata": { "tags": [] }, - "outputs": [], "source": [ - "ds1 = dset_dict['CMIP.historical.CESM2.Amon.gn']\n", - "ds2 = dset_dict['CMIP.historical.CESM2.fx.gn']\n", - "ds3 = dset_dict['CMIP.historical.CESM2.AERmon.gn']" + "#### Different types of information/data:\n", + "- Coordinates\n", + "- Data variables\n", + "- Global attributes\n", + "- Variable attributes\n", + "- Other? " ] }, { @@ -903,17 +1017,27 @@ " stroke: currentColor;\n", " fill: currentColor;\n", "}\n", "
<xarray.Dataset> Size: 229kB\n",
-       "Dimensions:    (member_id: 1, lat: 192, lon: 288, nbnd: 2)\n",
+       "
<xarray.Dataset> Size: 15GB\n",
+       "Dimensions:    (member_id: 1, lat: 192, lon: 288, time: 1980, lev: 32, nbnd: 2)\n",
        "Coordinates:\n",
        "  * lat        (lat) float64 2kB -90.0 -89.06 -88.12 -87.17 ... 88.12 89.06 90.0\n",
        "  * lon        (lon) float64 2kB 0.0 1.25 2.5 3.75 ... 355.0 356.2 357.5 358.8\n",
        "  * member_id  (member_id) object 8B 'r1i1p1f1'\n",
+       "  * lev        (lev) float64 256B -3.643 -7.595 -14.36 ... -957.5 -976.3 -992.6\n",
+       "  * time       (time) object 16kB 1850-01-15 12:00:00 ... 2014-12-15 12:00:00\n",
        "Dimensions without coordinates: nbnd\n",
        "Data variables:\n",
        "    areacella  (member_id, lat, lon) float32 221kB dask.array<chunksize=(1, 192, 288), meta=np.ndarray>\n",
-       "    lat_bnds   (lat, nbnd) float32 2kB dask.array<chunksize=(192, 2), meta=np.ndarray>\n",
-       "    lon_bnds   (lon, nbnd) float32 2kB dask.array<chunksize=(288, 2), meta=np.ndarray>\n",
+       "    mmrso4     (member_id, time, lev, lat, lon) float32 14GB dask.array<chunksize=(1, 1, 16, 96, 144), meta=np.ndarray>\n",
+       "    ps         (time, lat, lon) float32 438MB dask.array<chunksize=(1, 192, 288), meta=np.ndarray>\n",
+       "    p0         float32 4B ...\n",
+       "    a          (lev) float64 256B dask.array<chunksize=(32,), meta=np.ndarray>\n",
+       "    b          (lev) float64 256B dask.array<chunksize=(32,), meta=np.ndarray>\n",
+       "    b_bnds     (lev, nbnd) float64 512B dask.array<chunksize=(32, 2), meta=np.ndarray>\n",
+       "    lev_bnds   (lev, nbnd) float64 512B dask.array<chunksize=(32, 2), meta=np.ndarray>\n",
+       "    a_bnds     (lev, nbnd) float64 512B dask.array<chunksize=(32, 2), meta=np.ndarray>\n",
+       "    hurs       (member_id, time, lat, lon) float32 438MB dask.array<chunksize=(1, 1, 192, 288), meta=np.ndarray>\n",
+       "    tas        (member_id, time, lat, lon) float32 438MB dask.array<chunksize=(1, 1, 192, 288), meta=np.ndarray>\n",
        "Attributes: (12/57)\n",
        "    Conventions:                      CF-1.7 CMIP-6.2\n",
        "    activity_id:                      CMIP\n",
@@ -927,7 +1051,7 @@
        "    intake_esm_attrs:version:         v20190308\n",
        "    intake_esm_attrs:path:            /mnt/craas1-ns9989k-ns9560k/ESGF/CMIP6/...\n",
        "    intake_esm_attrs:_data_format_:   netcdf\n",
-       "    intake_esm_dataset_key:           CMIP.historical.CESM2.fx.gn