First commit of stuff

Eric Ma 2019-12-14 19:22:39 -05:00
parent 3e55a1d8d6
commit 57e5c7dd19
4 changed files with 331 additions and 0 deletions

Procfile (new file, 1 line)

@@ -0,0 +1 @@
web: streamlit run --server.port 5000 --server.baseUrlPath minimal-streamlit.herokuapp.com beta_distribution.py

beta_distribution.py (new file, 306 lines)

@@ -0,0 +1,306 @@
import streamlit as st
from scipy.stats import beta
import numpy as np
import matplotlib.pyplot as plt
st.header("Beta Distribution Tutorial")
st.write(
"""
The beta distribution describes a probability distribution
over values in the range (0, 1).
In our make-believe protein engineering project,
we use the beta distribution to help us
estimate a mutant's actual activity,
defined as a fraction from 0 to 1,
aggregated over multiple biological replicates (multiple colonies per mutant)
and multiple technical replicates
(replicate measurements of individual colonies).
Before we go on to what Bayesian estimation is,
please go ahead and change the values of the alpha and beta parameters
using the sliders on the left sidebar
(click the arrow on the left if it is closed).
"""
)
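# Editor's aside (a sketch, not part of the original app): for reference, the beta
# PDF is P(x) = x**(a - 1) * (1 - x)**(b - 1) / B(a, b), where B is the beta
# function from scipy.special. A quick sanity check with illustrative values:
from scipy.special import beta as beta_fn

_a, _b, _x = 2.0, 12.0, 0.3
assert np.isclose(
    beta(_a, _b).pdf(_x),
    _x ** (_a - 1) * (1 - _x) ** (_b - 1) / beta_fn(_a, _b),
)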
# These are going to be "globally" defined, because I intend to use them
# across multiple plots.
st.sidebar.markdown(
"""
# Control Panel
"""
)
alpha_slider = st.sidebar.slider(
"Value of alpha parameter",
min_value=0.1,
max_value=100.0,
step=1.0,
value=2.0,
)
beta_slider = st.sidebar.slider(
"Value of beta parameter",
min_value=0.1,
max_value=100.0,
step=1.0,
value=12.0,
)
def plot_dist(alpha_value: float, beta_value: float, data: np.ndarray = None):
    """Plot the beta PDF for the given parameters, overlaying data if provided."""
    beta_dist = beta(alpha_value, beta_value)
    xs = np.linspace(0, 1, 1000)
    ys = beta_dist.pdf(xs)
    fig, ax = plt.subplots(figsize=(7, 3))
    ax.plot(xs, ys)
    ax.set_xlim(0, 1)
    ax.set_xlabel("x")
    ax.set_ylabel("P(x)")
    if data is not None:
        likelihoods = beta_dist.pdf(data)
        sum_log_likelihoods = np.sum(beta_dist.logpdf(data))
        ax.vlines(data, ymin=0, ymax=likelihoods)
        ax.scatter(data, likelihoods, color="black")
        st.write(
            f"""
            _Under your alpha={alpha_value:.2f} and beta={beta_value:.2f},
            the sum of log likelihoods is {sum_log_likelihoods:.2f}_
            """
        )
    st.pyplot(fig)
plot_dist(alpha_slider, beta_slider)
st.subheader("What are the alpha and beta parameters?")
st.write(
"""
The alpha and beta parameters control the shape of the beta distribution.
Colloquially, in terms of coin flips, they correspond roughly to
the "number of successes" (alpha) and the "number of failures" (beta).
The only difference here is that alpha and beta can take non-integer values,
i.e. values with decimal places.
"""
)
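# Editor's aside (a sketch, not part of the original app): the "successes/failures"
# reading comes from Bayesian coin-flipping. Starting from a flat Beta(1, 1) prior,
# observing h heads and t tails yields a Beta(1 + h, 1 + t) posterior. Numerically,
# the normalized product of prior and likelihood matches that posterior:
_h, _t = 3, 9
_grid = np.linspace(0.001, 0.999, 999)
_unnorm = beta(1, 1).pdf(_grid) * _grid ** _h * (1 - _grid) ** _t
_unnorm /= np.trapz(_unnorm, _grid)  # normalize numerically
assert np.allclose(_unnorm, beta(1 + _h, 1 + _t).pdf(_grid), atol=1e-3)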
st.subheader("How do we use it?")
st.write(
"""
The beta distribution is useful for modelling quantities
that can only take on values between 0 and 1.
A mutant's activity fraction is an example of this.
"""
)
st.header("Bayesian Estimation of Fractions using Beta-Distribution")
st.write(
"""
The core activity here is to obtain
the alpha and beta parameters of the beta distribution
that best explain the data that we observe.
Let us assume that we made three observations of activity for a mutant:
"""
)
with st.echo():
    data = [0.83, 0.86, 0.91]
st.write(
"""
Can you find beta distribution parameters
that best explain these three data points?
The best ratio of alpha to beta is probably around 6:1.
However, is it 6:1, or is it 36:6, or is it 90:15?
Play around with different ratios to see which one
maximizes the sum of log likelihoods.
(A comparison of a few candidate pairs is printed below the chart.)
)
plot_dist(alpha_slider, beta_slider, data)
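# Editor's aside (a sketch, not part of the original app): candidate pairs sharing
# the same alpha:beta ratio (and hence the same mean) give different sums of log
# likelihoods, so the scale matters, not just the ratio. Illustrative guesses only:
for _a, _b in [(6, 1), (36, 6), (90, 15)]:
    _sll = np.sum(beta(_a, _b).logpdf(data))
    st.write(f"alpha={_a}, beta={_b}: sum of log likelihoods = {_sll:.2f}")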
st.write(
"""
Let's try this out with another dataset.
Which parameter values would best explain these values?
(Hint: it's approximately a ratio of alpha:beta ~ 1:1.)
"""
)
with st.echo():
    data = [0.53, 0.56, 0.51]
plot_dist(alpha_slider, beta_slider, data)
st.write(
"""
As you might have noticed,
the greater the magnitude of the alpha and beta parameters,
the tighter the beta distribution becomes.
Let's take a look at this in the following chart.
"""
)
alpha_value = st.radio(label="Select a value for alpha", options=(2, 5))
beta_value = st.radio(label="Select a value for beta", options=(2, 5))
beta_dist_1 = beta(alpha_value, beta_value)
beta_dist_2 = beta(alpha_value * 5, beta_value * 5)
beta_dist_3 = beta(alpha_value * 10, beta_value * 10)
xs = np.linspace(0, 1, 1000)
ys_1 = beta_dist_1.pdf(xs)
ys_2 = beta_dist_2.pdf(xs)
ys_3 = beta_dist_3.pdf(xs)
fig4 = plt.figure(figsize=(7, 3))
plt.plot(xs, ys_1, label=f"alpha={alpha_value}, beta={beta_value}")
plt.plot(xs, ys_2, label=f"alpha={alpha_value * 5}, beta={beta_value * 5}")
plt.plot(xs, ys_3, label=f"alpha={alpha_value * 10}, beta={beta_value * 10}")
plt.ylabel("P(x)")
plt.legend()
st.pyplot(fig4)
st.write(
"""
As you can see, the width of the curve decreases
as the scale of the alpha and beta parameters increases.
_We become less uncertain._
"""
)
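# Editor's aside (a sketch, not part of the original app): writing mu = a / (a + b)
# for the mean, the variance of Beta(a, b) is mu * (1 - mu) / (a + b + 1), so
# scaling both parameters by k keeps the mean fixed while shrinking the variance:
_k = 10
assert np.isclose(beta(2, 5).mean(), beta(2 * _k, 5 * _k).mean())  # same mean
assert beta(2 * _k, 5 * _k).var() < beta(2, 5).var()  # smaller variance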
st.write(
"""
Now, let's focus our attention on
the variance of the beta distribution when operating at a fixed scale
(i.e. with the sum of alpha and beta held constant).
Let's assume that alpha + beta = 50,
and we will have you adjust the value of alpha.
"""
)
max_val = 50
alpha_value = st.slider("alpha", 1, max_val - 1)
beta_value = max_val - alpha_value
beta_dist = beta(alpha_value, beta_value)
xs = np.linspace(0, 1, 1000)
ys = beta_dist.pdf(xs)
fig = plt.figure(figsize=(7, 3))
plt.plot(xs, ys, label=f"alpha={alpha_value}, beta={beta_value}")
plt.title(f"StDev: {beta_dist.std():.3f}")
plt.ylabel("P(x)")
plt.legend()
st.pyplot(fig)
st.write(
"""
As you can see, the variance decreases as we move towards the extremes.
A more comprehensive look at the standard deviation as a function of alpha,
keeping the "scale" (i.e. the sum of alpha and beta) fixed at 50, is below.
"""
)
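# Editor's aside (a sketch, not part of the original app): with the scale fixed at
# alpha + beta = 50, the closed-form variance a * b / ((a + b)**2 * (a + b + 1))
# is maximized at a = b = 25, i.e. when the distribution is centered at 0.5:
_ab = 50
_a = np.arange(1, _ab)
_var = _a * (_ab - _a) / (_ab ** 2 * (_ab + 1))
assert _a[np.argmax(_var)] == _ab // 2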
alpha_values = np.arange(1, 50)
beta_dists = beta(alpha_values, 50 - alpha_values)
ys = beta_dists.std()
fig = plt.figure(figsize=(7, 4))
plt.scatter(alpha_values, ys)
plt.xlabel("alpha")
plt.title("standard deviation of the beta distribution as a function of alpha")
plt.ylabel("std")
st.pyplot(fig)
st.header("Conclusions")
st.write(
"""
What we have found thus far is that,
according to the math of the beta distribution,
at a given scale (i.e. a fixed sum of alpha and beta),
variance is always highest when the distribution
is centered in the middle of the (0, 1) interval.
The noisier the measured ratios are,
the smaller the alpha + beta scale that maximizes their likelihood,
and hence the greater the variance of the estimated likelihood distribution.
When we do Bayesian estimation,
we are estimating the parameters alpha and beta
that explain both the central tendency
of our measured ratios (i.e. the mean)
and the variance in the measured ratios.
Ratios that are more variably distributed, such as `[0.81, 0.56, 0.93]`,
will naturally get smaller estimated alphas and betas
to explain the high variance in the ratios.
(They will also tend not to be centered around the extremes,
which also makes sense: to generate data centered around the extremes,
we need extremely tightly-distributed data.)
Ratios that are more tightly distributed, such as `[0.51, 0.49, 0.48]`,
will naturally get larger estimated alphas and betas.
This is the spirit of what we are doing in the Bayesian estimation model.
In short, both the math and the measurements contribute to the "wave" phenomenon.
"""
)
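# Editor's aside (a sketch, not part of the original app): scipy can fit alpha and
# beta by maximum likelihood, illustrating the claim above. Fixing loc=0 and
# scale=1 restricts the fit to the standard (0, 1) support:
_a_noisy, _b_noisy, _, _ = beta.fit([0.81, 0.56, 0.93], floc=0, fscale=1)
_a_tight, _b_tight, _, _ = beta.fit([0.51, 0.49, 0.48], floc=0, fscale=1)
# Tighter data should be explained by a larger alpha + beta "scale":
assert _a_tight + _b_tight > _a_noisy + _b_noisy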
st.header("Try with your own data!")
st.write(
"""
I'd like to invite you to try this with your own data.
Type numbers into the text box below, separated by commas.
"""
)
data = st.text_input("Your data")
def process_data(data):
    # One nice thing about streamlit is that it lets us surface informative
    # errors to our end-users using built-in Python error handling:
    # exceptions raised here are rendered by streamlit in the app itself.
    try:
        # Split on commas (tolerating stray whitespace) and cast to floats.
        data = np.array([float(i.strip()) for i in data.split(",")])
    except ValueError:
        raise ValueError("The data that you input must be castable as floats!")
    if not (np.all(data > 0) and np.all(data < 1)):
        raise ValueError("Your input data must satisfy 0 < x < 1.")
    return data
plot_dist(alpha_slider, beta_slider, process_data(data))
st.header("Congratulations!")
st.write("Click the button below once you're done with this tutorial.")
if st.button("CLICK ME!"):
    st.balloons()
st.sidebar.markdown(
"""
# Thank you!
Did you like this mini-tutorial?
If you did, please give it a star on [GitHub](https://github.com/ericmjl/minimal-streamlit-example).
This was hand-crafted using streamlit in under 3 hours.
Created by [Eric J. Ma](https://ericmjl.github.io).
"""
)

environment.yml (new file, 20 lines)

@@ -0,0 +1,20 @@
name: minimal-streamlit-example
channels:
- defaults
- conda-forge
- ericmjl
dependencies:
- python=3.8
- conda
- scikit-learn
- scipy
- pandas
- numpy
- matplotlib
- pip
- pylint
- pydocstyle
- flake8
- black
- pip:
  - streamlit

requirements.txt (new file, 4 lines)

@@ -0,0 +1,4 @@
numpy
matplotlib
streamlit
scipy