From dfe5a991ee1ff0df048a42f76a4d92641e5b98d5 Mon Sep 17 00:00:00 2001
From: Krupakar Reddy <137398727+Krupakar-Reddy-S@users.noreply.github.com>
Date: Mon, 12 Aug 2024 11:46:25 +0530
Subject: [PATCH] Added example from README (#98)

---
 welcome.md | 121 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 121 insertions(+)

diff --git a/welcome.md b/welcome.md
index c0dfb91..ecae3b9 100644
--- a/welcome.md
+++ b/welcome.md
@@ -19,6 +19,127 @@ Here is what sets it apart:
 * **Batteries included**: Includes probability distributions, Gaussian processes, ABC, SMC and much more. It integrates nicely with {doc}`ArviZ <arviz:index>` for visualizations and diagnostics, as well as {doc}`Bambi <bambi:index>` for high-level mixed-effect models.
 * **Community focused**: Ask questions on [discourse](https://discourse.pymc.io), join [MeetUp events](https://meetup.com/pymc-online-meetup/), follow us on [Twitter](https://twitter.com/pymc_devs), and start [contributing](https://www.pymc.io/projects/docs/en/latest/contributing/index.html).
 
+## Example from Linear Regression
+
+This example demonstrates how to perform Bayesian inference for a linear regression model to predict plant growth based on environmental factors.
+
+Plant growth can be influenced by multiple factors, and understanding these relationships is crucial for optimizing agricultural practices.
+
+```python
+import pymc as pm
+
+# Taking draws from a normal distribution
+seed = 42
+x_dist = pm.Normal.dist(shape=(100, 3))
+x_data = pm.draw(x_dist, random_seed=seed)
+
+# Independent Variables:
+# Sunlight Hours: Number of hours the plant is exposed to sunlight daily.
+# Water Amount: Daily water amount given to the plant (in milliliters).
+# Soil Nitrogen Content: Percentage of nitrogen content in the soil.
+
+
+# Dependent Variable:
+# Plant Growth (y): Measured as the increase in plant height (in centimeters) over a certain period.
+
+
+# Define coordinate values for all dimensions of the data
+coords={
+ "trial": range(100),
+ "features": ["sunlight hours", "water amount", "soil nitrogen"],
+}
+
+# Define generative model
+with pm.Model(coords=coords) as generative_model:
+   x = pm.Data("x", x_data, dims=["trial", "features"])
+
+   # Model parameters
+   betas = pm.Normal("betas", dims="features")
+   sigma = pm.HalfNormal("sigma")
+
+   # Linear model
+   mu = x @ betas
+
+   # Likelihood
+   # Assuming we measure deviation of each plant from baseline
+   plant_growth = pm.Normal("plant growth", mu, sigma, dims="trial")
+
+
+# Generating data from model by fixing parameters
+fixed_parameters = {
+ "betas": [5, 20, 2],
+ "sigma": 0.5,
+}
+with pm.do(generative_model, fixed_parameters) as synthetic_model:
+   idata = pm.sample_prior_predictive(random_seed=seed) # Sample from prior predictive distribution.
+   synthetic_y = idata.prior["plant growth"].sel(draw=0, chain=0)
+
+
+# Infer parameters conditioned on observed data
+with pm.observe(generative_model, {"plant growth": synthetic_y}) as inference_model:
+   idata = pm.sample(random_seed=seed)
+
+   summary = pm.stats.summary(idata, var_names=["betas", "sigma"])
+   print(summary)
+```
+From the summary, we can see that the mean of the inferred parameters are very close to the fixed parameters
+
+| Params                  | mean  |  sd  | hdi_3% | hdi_97% | mcse_mean | mcse_sd | ess_bulk | ess_tail | r_hat |
+|-------------------------|-------|------|--------|---------|-----------|---------|----------|----------|-------|
+| betas[sunlight hours]   | 4.972 | 0.054 | 4.866 | 5.066 | 0.001 | 0.001 | 3003 | 1257 | 1 |
+| betas[water amount]     | 19.963 | 0.051 | 19.872 | 20.062 | 0.001 | 0.001 | 3112 | 1658 | 1 |
+| betas[soil nitrogen]    | 1.994 | 0.055 | 1.899 | 2.107 | 0.001 | 0.001 | 3221 | 1559 | 1 |
+| sigma                   | 0.511 | 0.037 | 0.438 | 0.575 | 0.001 | 0 | 2945 | 1522 | 1 |
+
+```python
+# Simulate new data conditioned on inferred parameters
+new_x_data = pm.draw(
+    pm.Normal.dist(shape=(3, 3)),
+    random_seed=seed,
+)
+new_coords = coords | {"trial": [0, 1, 2]}
+
+with inference_model:
+    pm.set_data({"x": new_x_data}, coords=new_coords)
+    pm.sample_posterior_predictive(
+        idata,
+        predictions=True,
+        extend_inferencedata=True,
+        random_seed=seed,
+    )
+
+pm.stats.summary(idata.predictions, kind="stats")
+```
+The new data conditioned on inferred parameters would look like:
+
+| Output            | mean  |  sd  | hdi_3% | hdi_97% |
+|-------------------|-------|------|--------|---------|
+| plant growth[0]   | 14.229 | 0.515 | 13.325 | 15.272 |
+| plant growth[1]   | 24.418 | 0.511 | 23.428 | 25.326 |
+| plant growth[2]   | -6.747 | 0.511 | -7.740 | -5.797 |
+
+```python
+# Simulate new data, under a scenario where the first beta is zero
+with pm.do(
+    inference_model,
+    {inference_model["betas"]: inference_model["betas"] * [0, 1, 1]},
+) as plant_growth_model:
+    new_predictions = pm.sample_posterior_predictive(
+        idata,
+        predictions=True,
+        random_seed=seed,
+    )
+
+pm.stats.summary(new_predictions, kind="stats")
+```
+The new data, under the above scenario would look like:
+
+| Output            | mean  |  sd  | hdi_3% | hdi_97% |
+|-------------------|-------|------|--------|---------|
+| plant growth[0]   | 12.149 | 0.515 | 11.193 | 13.135 |
+| plant growth[1]   | 29.809 | 0.508 | 28.832 | 30.717 |
+| plant growth[2]   | -0.131 | 0.507 | -1.121 | 0.791 |
+
 ## Get started
 * [Installation instructions](https://www.pymc.io/projects/docs/en/latest/installation.html)
 * [Beginner guide (if you **do not** know Bayesian modeling)](https://www.pymc.io/projects/docs/en/latest/learn/core_notebooks/pymc_overview.html)