
Example of the EM algorithm: estimating coin biases

Here's a nice example of using the EM algorithm.

Suppose we have two coins numbered 0 and 1, each with unknown bias, so the probability of seeing a head when each one is tossed is $p_0$ and $p_1$ respectively.

Suppose we have the results of 5 experiments: in each experiment, one of the two coins is chosen at random (coin 0 with probability $\lambda$) and tossed 10 times, and we record the results of the tosses.

We are told the results of the tosses, but not which coin was tossed in each experiment.

So our observed data $z$ is the results of all the coin tosses. This can be condensed into the number of heads thrown in each experiment

$$ z = [z_0, z_1, z_2, z_3, z_4], 0 \le z_i \le 10$$

Our hidden data $x$ is which coin was tossed in which experiment

$$ x = [x_0, x_1, x_2, x_3, x_4], x_i \in \{0, 1\}$$

And our $\theta$ is a vector consisting of all the unknown parameters - both coin biases and the probability of choosing coin 0

$$\theta = [p_0, p_1, \lambda]$$

We're going to use the EM algorithm to estimate these parameters. At each stage we'll have a current best guess of the parameters

$$\theta^{(t)} = [p_0^{(t)}, p_1^{(t)}, \lambda^{(t)}]$$

and we'll use the EM algorithm formulas above to make a new guess

$$\theta^{(t+1)} = [p_0^{(t+1)}, p_1^{(t+1)}, \lambda^{(t+1)}]$$

Step 1: calculate $P(x | z, \theta^{(t)})$

Let's calculate $P(x | z, \theta^{(t)})$. This is the probability of each of our $x_i$, given the observed data and our current best guess of the parameters.

I'm going to stop writing the dependence on $\theta^{(t)}$ explicitly for a moment, to make the notation a bit easier to read. Instead I'll write

$$P_t( \dots )$$

to mean

$$P( \dots |\ \theta^{(t)} )$$

We can consider each experiment to be independent, so we have

$$ P(x | z, \theta^{(t)}) = P_t(x | z) = \prod_{i=0}^{4} P_t(x_i | z_i) $$

and

\begin{align*} P_t(x_i | z_i) = \frac{ P_t(x_i, z_i)}{P_t(z_i)} \\ \end{align*}

by applying Bayes' rule (keeping everything conditional on $\theta^{(t)}$). Each $x_i$ can either be 0 (if coin 0 was chosen for experiment $i$) or 1 (if coin 1 was chosen).

Since we only have two coins, the expression above is simple to break down further.

\begin{align*} \frac{ P_t(x_i, z_i)}{P_t(z_i)} & = \frac{ P_t(x_i, z_i)}{\sum_{j=0,1} P_t(x_i = j,\ z_i)} \\ & = \frac{ P_t(z_i |\ x_i) P_t( x_i ) }{ \sum_{j=0,1} P_t(z_i |\ x_i = j)\, P_t(x_i = j) } \end{align*}

We can now start to write down what these probabilities actually are. We know that, given our current best guess of the parameters $\theta^{(t)}$:

For any $i$ \begin{align*} P_t(x_i = 0) & = \lambda^{(t)} \\ P_t(x_i = 1) & = 1 - \lambda^{(t)} \end{align*}

Each $P_t(z_i |\ x_i = j)$ term is simply a likelihood from a binomial distribution, because the coin tossed, 0 or 1, is known (since the probability is conditional on the value of $x_i$).

If $x_i = 0$, coin 0 was chosen, and we use $p_0^{(t)}$ in our binomial pmf. If $x_i = 1$, coin 1 was chosen, and we use $p_1^{(t)}$. In either case, we observed $z_i$ heads in experiment $i$, and the total number of coin tosses is $n = 10$. So the pmf is

$$ P_t(z_i |\ x_i = j) = \binom{10}{z_i} \left(p_{j}^{(t)} \right)^{z_i} \left( 1 - p_{j}^{(t)} \right)^{10 - z_i} $$

All of the above working is actually a fairly simple application of Bayes' rule, with everything being conditional on the current best guess of the parameters, $\theta^{(t)}$

Putting this all together, we can write down our expression for $P_t(x_i |\ z_i)$.

\begin{align*} P_t(x_i = 0 | z_i) & = \frac{ \lambda^{(t)} \binom{10}{z_i} \left(p_0^{(t)} \right)^{z_i} \left( 1 - p_0^{(t)} \right)^{10 - z_i} }{ \lambda^{(t)} \binom{10}{z_i} \left(p_0^{(t)} \right)^{z_i} \left( 1 - p_0^{(t)} \right)^{10 - z_i} + (1 - \lambda^{(t)}) \binom{10}{z_i} \left(p_1^{(t)} \right)^{z_i} \left( 1 - p_1^{(t)} \right)^{10 - z_i} } \end{align*}

and we know $$P_t(x_i = 1 | z_i) = 1 - P_t(x_i = 0 | z_i)$$

The big fraction above looks a bit complicated, but you can see the same elements repeated all over the place. In code we can work this out trivially using a function to return the Binomial pmf. Note that all the elements of the equation are known values based on the observed data $z$ and our current best guess of the parameters $\theta^{(t)}$.

In words, $P_t(x_i = 0 | z_i)$ is the probability that the coin used in experiment $i$ is coin 0, based on the observed data (the result of experiment $i$, $z_i$) and the current best guess of the parameters.

We're dealing with a lot of probabilities here, so to try to make the notation a little clearer, let's relabel these probabilities as "weights".

Let's relabel $P_t(x_i = 0 | z_i)$ as $w_{i,0}^{(t)}$ and $P_t(x_i = 1 | z_i)$ as $w_{i,1}^{(t)}$
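To make this concrete, here is a minimal sketch of the weight calculation for a single experiment in Python, using scipy's binomial pmf. The numbers and variable names are made up purely for illustration; they are not the data from the experiments discussed here.

```python
from scipy.stats import binom

# Hypothetical values, purely for illustration (not the actual experiment data)
z_i = 8                  # heads observed in experiment i (out of 10 tosses)
p0_t, p1_t = 0.6, 0.5    # current guesses of the two coin biases
lam_t = 0.5              # current guess of the probability of choosing coin 0

# w_{i,0}^{(t)} = P_t(x_i = 0 | z_i), by Bayes' rule
numerator = lam_t * binom.pmf(z_i, 10, p0_t)
denominator = numerator + (1 - lam_t) * binom.pmf(z_i, 10, p1_t)
w_i0 = numerator / denominator
w_i1 = 1 - w_i0
```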

Step 2: maximise $h_t(\theta)$

We now move on to the maximisation step - we will derive updated estimates of all the parameters by maximising the function $h_t$ from the EM algorithm formulas above.

$$h_t(\theta) = \sum_x P(x | z, \theta^{(t)}) \log P(x, z |\ \theta)$$

Now we must write down our expression for $\log P(x, z |\ \theta)$.

We'll work in the same way as we did above when deriving the conditional probability $P(x | z, \theta^{(t)})$, applying Bayes' rule to derive a manageable formula for the probability.

Each experiment is independent, so we have a product of the individual likelihoods, which becomes a sum of the individual log likelihoods when we take the logarithm

\begin{align*} \log P(z, x | \theta) & = \log \left( \prod_i P(z_i, \ x_i | \theta) \right) \\ & = \log \left( \prod_i P(z_i |\ x_i, \theta) P(x_i |\ \theta) \right) \\ & = \sum_i \log \left( P(z_i |\ x_i, \theta) P(x_i |\ \theta) \right) \\ \end{align*}

And for each experiment, given $\theta = (p_0, p_1, \lambda)$, we know that

\begin{align*} P(x_i = 0) & = \lambda \\ P(x_i = 1) & = 1 - \lambda \end{align*}

and

$$ P(z_i |\ x_i = j) = \binom{10}{z_i} p_{j}^{z_i} \left( 1 - p_{j} \right)^{10 - z_i} $$

The final expression for $h_t(\theta)$ is

$$h_t(\theta) = \sum_x P(x | z, \theta^{(t)}) \log P(z, x |\ \theta)$$

And the sum over all possible values of $x$ reduces to a sum, for each experiment $i = 0, \dots, 4$, over the two possibilities $x_i = 0$ and $x_i = 1$, weighted by $w_{i,0}^{(t)}$ and $w_{i,1}^{(t)}$ respectively.

\begin{align*} & = \sum_i w_{i,0}^{(t)} \log \left( \binom{10}{z_i} p_{0}^{z_i} \left( 1 - p_{0} \right)^{10 - z_i} \lambda \right) + \sum_i w_{i,1}^{(t)} \log \left( \binom{10}{z_i} p_{1}^{z_i} \left( 1 - p_{1} \right)^{10 - z_i} (1 - \lambda) \right) \\ & = \sum_i w_{i,0}^{(t)} \left( \log \binom{10}{z_i} + z_i \log p_{0} + (10 - z_i) \log \left( 1 - p_{0} \right) + \log \lambda \right) \\ & + \sum_i w_{i,1}^{(t)} \left( \log \binom{10}{z_i} + z_i \log p_{1} + (10 - z_i) \log \left( 1 - p_{1} \right) + \log (1 - \lambda) \right) \\ \end{align*}

To find the maximum of this expression, we must differentiate it w.r.t. each of the individual elements of $\theta$.

In fact, we should partially differentiate the expression w.r.t. each variable, and solve the resulting equations simultaneously, but that won't be necessary here since there are no "cross-terms".
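As a side note, $h_t$ is also easy to evaluate numerically, which gives a handy sanity check on the closed-form updates we're about to derive. Here is a rough sketch (the function name and arguments are my own choices) that computes $h_t(\theta)$ for given weights, observed head counts, and candidate parameters:

```python
import numpy as np
from scipy.stats import binom

def h_t(theta, z, w0, n=10):
    """Evaluate h_t(theta) given observed head counts z and the weights w0,
    where w0[i] = P(x_i = 0 | z_i, theta^(t)); the coin-1 weight is 1 - w0[i]."""
    p0, p1, lam = theta
    z = np.asarray(z)
    w0 = np.asarray(w0)
    w1 = 1 - w0
    # binom.logpmf gives log( C(n, z) p^z (1-p)^(n-z) ), matching the expansion above
    term0 = w0 * (binom.logpmf(z, n, p0) + np.log(lam))
    term1 = w1 * (binom.logpmf(z, n, p1) + np.log(1 - lam))
    return np.sum(term0 + term1)
```

One could, for example, check that perturbing the parameters returned by the update formulas derived below never increases $h_t$. With that in hand, back to maximising $h_t$ analytically.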

We find

\begin{align*} \frac{\partial h_t}{\partial p_0} = \sum_i w_{i,0}^{(t)} \left( \frac{z_i}{p_0} + \frac{-(10 - z_i)}{ 1 - p_{0} } \right) \\ \end{align*}

This equals 0 when

\begin{align*} 0 & = \sum_i w_{i,0}^{(t)} \left( \frac{z_i}{p_0} + \frac{-(10 - z_i)}{ 1 - p_{0} } \right) \\ \\ \implies \sum_i w_{i,0}^{(t)} \frac{z_i}{p_0} & = \sum_i w_{i,0}^{(t)} \frac{10 - z_i}{ 1 - p_{0} } \\ \\ \implies \frac{1}{p_0} \sum_i w_{i,0}^{(t)} z_i & = \frac{1}{1 - p_0 } \sum_i w_{i,0}^{(t)} (10 - z_i) \\ \\ \implies \frac{p_0}{1 - p_0 } & = \frac{\sum_i w_{i,0}^{(t)} z_i}{ \sum_i w_{i,0}^{(t)} (10 - z_i) }\\ \\ \implies \frac{1 - p_0 }{p_0} & = \frac{ \sum_i w_{i,0}^{(t)} (10 - z_i) }{\sum_i w_{i,0}^{(t)} z_i}\\ \\ \implies \frac{1}{p_0} - 1 & = \frac{ \sum_i w_{i,0}^{(t)} (10 - z_i) }{\sum_i w_{i,0}^{(t)} z_i}\\ \\ \implies \frac{1}{p_0} - 1 & = \frac{ \sum_i w_{i,0}^{(t)} 10 }{\sum_i w_{i,0}^{(t)} z_i} - 1 \\ \\ \implies p_0 & = \frac{\sum_i w_{i,0}^{(t)} z_i}{ \sum_i w_{i,0}^{(t)} 10 } \\ \end{align*}

So we see that the EM-algorithm next-step estimate for $p_0$ is the weighted sum of the heads observed in each experiment, divided by the weighted total number of tosses:

$$ p_0^{(t+1)} = \frac{\sum_i w_{i,0}^{(t)} z_i}{ \sum_i w_{i,0}^{(t)} \cdot 10 } $$

Looking at the above expression for $h_t$, by symmetry we can write down the update rule for $p_1$:

$$ p_1^{(t+1)} = \frac{\sum_i w_{i,1}^{(t)} z_i}{ \sum_i w_{i,1}^{(t)} \cdot 10 } $$

Similarly, the value of $\lambda$ that maximises $h_t$ is found by differentiating w.r.t. $\lambda$.

$$\frac{\partial h_t}{\partial \lambda} = \sum_i w_{i,0}^{(t)} \frac{1}{ \lambda } + \sum_i w_{i,1}^{(t)} \frac{-1}{ 1 - \lambda } $$

And setting the derivative equal to zero, we find

\begin{align*} 0 & = \sum_i w_{i,0}^{(t)} \frac{1}{ \lambda } + \sum_i w_{i,1}^{(t)} \frac{-1}{ 1 - \lambda } \\ \\ \implies \sum_i w_{i,0}^{(t)} \frac{1}{ \lambda } & = \sum_i w_{i,1}^{(t)} \frac{1}{ 1 - \lambda } \\ \\ \implies \frac{ 1 - \lambda }{ \lambda } & = \frac{\sum_i w_{i,1}^{(t)} }{ \sum_i w_{i,0}^{(t)}} \\ \\ \implies \frac{ 1 }{ \lambda } - 1 & = \frac{\sum_i w_{i,1}^{(t)} }{ \sum_i w_{i,0}^{(t)}} \\ \\ \implies \frac{ 1 }{ \lambda } & = \frac{\sum_i w_{i,1}^{(t)} }{ \sum_i w_{i,0}^{(t)}} + 1 \\ \implies \frac{ 1 }{ \lambda } & = \frac{\sum_i w_{i,1}^{(t)} + \sum_i w_{i,0}^{(t)}}{ \sum_i w_{i,0}^{(t)}} \\ \\ \implies \lambda & = \frac{ \sum_i w_{i,0}^{(t)}}{\sum_i w_{i,1}^{(t)} + \sum_i w_{i,0}^{(t)}} \\ \\ \implies \lambda & = \frac{ \sum_i w_{i,0}^{(t)}}{ \sum_i 1} \\ \\ \implies \lambda & =\frac{ \sum_i w_{i,0}^{(t)}}{ 5 } \\ \end{align*}

So we see that the next-step estimate $\lambda^{(t+1)}$ is the sum of the weights $w_{i,0}^{(t)}$ (the probabilities that coin 0 was used in each experiment), divided by 5 (the number of experiments run).

I hope you'll agree that this intuitively makes a lot of sense as an update.

Seeing the formulas above in action

Some of the formulas and derivations above might seem rather long and complicated.

But the approach isn't as hard as it might seem, and often seeing things done in code makes it clearer. Here we'll implement some functions for applying the formulas above, allowing us to apply the EM algorithm to the coin flip problem described.
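As a minimal sketch (the function names and signatures here are my own choices), we need a function returning the binomial pmf and a function that carries out Step 1, computing the weight $w_{i,0}^{(t)}$ for every experiment:

```python
import numpy as np
from scipy.stats import binom

N_TOSSES = 10  # each experiment consists of 10 tosses of the chosen coin

def binomial_pmf(z, n, p):
    """Probability of observing z heads in n tosses of a coin with bias p."""
    return binom.pmf(z, n, p)

def e_step(z, p0, p1, lam):
    """Step 1: compute w_{i,0} = P(x_i = 0 | z_i, theta) for every experiment,
    i.e. the posterior probability that coin 0 was used, given the current
    parameter guesses (p0, p1, lam). The coin-1 weight is just 1 - w_{i,0}."""
    z = np.asarray(z)
    num = lam * binomial_pmf(z, N_TOSSES, p0)
    den = num + (1 - lam) * binomial_pmf(z, N_TOSSES, p1)
    return num / den
```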

That's it - that's all the functions we'll need to apply the EM algorithm in this situation.

We just need to implement the update formulas, which are stated in terms of the "weights" $$w_{i,0}^{(t)} = P(x_i = 0 | z_i, \theta^{(t)})$$ and $$w_{i,1}^{(t)} = P(x_i = 1 | z_i, \theta^{(t)})$$
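A sketch of how the update formulas (Step 2) and the overall EM loop might look, building on the `e_step` function above; again the names are my own, and the data in the commented usage line is a placeholder rather than the actual experimental results:

```python
import numpy as np

def m_step(z, w0, n=10):
    """Step 2: apply the update formulas, returning new estimates of
    (p0, p1, lambda) from the observed head counts z and the weights w0."""
    z = np.asarray(z)
    w1 = 1 - w0
    p0 = np.sum(w0 * z) / np.sum(w0 * n)
    p1 = np.sum(w1 * z) / np.sum(w1 * n)
    lam = np.sum(w0) / len(z)
    return p0, p1, lam

def run_em(z, p0, p1, lam, n_iter=100):
    """Alternate Step 1 and Step 2 from an initial guess of the parameters."""
    for _ in range(n_iter):
        w0 = e_step(z, p0, p1, lam)
        p0, p1, lam = m_step(z, w0)
    return p0, p1, lam

# Placeholder usage (made-up head counts, not the experiments discussed below):
# p0_hat, p1_hat, lam_hat = run_em(z=[3, 8, 2, 9, 7], p0=0.3, p1=0.7, lam=0.5)
```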

Discussion

If we look at these numbers, they make good intuitive sense.

The algorithm has estimated that

Looking at our experiments, we have

So it makes sense that the estimate of $\lambda$ is around 0.4, because it looks like 2 out of our 5 experiments were performed with coin 0.