Lecture 6 - Primal optimization¶

ECE364 - Programming Methods for Machine Learning¶

Nickvash Kani¶

Slides based on prior lectures by Alex Schwing, Aiguo Han, Farzad Kamalabadi, Corey Snyder. All mistakes are my own!¶

In this lecture we will cover:

  • Basics of optimization
  • Convexity
  • Steepest descent
  • Gradient momentum

What's primal optimization?¶

1. Primal Optimization Problem

  • The primal problem is the original optimization problem we want to solve.
  • Typically, it is formulated as a minimization or maximization problem with constraints.
  • It involves decision variables, an objective function, and constraints.

Example of a Primal Problem (Convex Optimization Form) $$ \min_{x} \quad f(x) $$ $$ \text{subject to } g_i(x) \leq 0, \quad i = 1, ..., m $$ $$ h_j(x) = 0, \quad j = 1, ..., p $$ where:

  • $ f(x) $ is the objective function.
  • $ g_i(x) $ are the inequality constraints.
  • $ h_j(x) $ are the equality constraints.
  • $ x $ are the primal decision variables.
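For instance (a hypothetical toy problem, not from the slides), we can solve a small primal problem numerically with SciPy's minimize; note that SciPy's 'ineq' convention is $ g(x) \geq 0 $, the opposite sign of the formulation above:

In [ ]:
from scipy.optimize import minimize

# Toy primal problem:
#   minimize    f(x) = x1^2 + x2^2
#   subject to  g(x) = 1 - x1 - x2 <= 0   (i.e., x1 + x2 >= 1)
f = lambda x: x[0] ** 2 + x[1] ** 2

# SciPy's 'ineq' constraints require fun(x) >= 0, so pass x1 + x2 - 1.
constraints = ({'type': 'ineq', 'fun': lambda x: x[0] + x[1] - 1},)

result = minimize(f, x0=[0.0, 0.0], constraints=constraints)
print(result.x)  # -> approximately [0.5, 0.5]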

2. Dual Optimization Problem

  • The dual problem is derived from the primal problem using Lagrange multipliers.
  • Instead of optimizing over the original decision variables $x$, it optimizes over the Lagrange multipliers.
  • The dual problem provides a lower bound (in minimization problems) or an upper bound (in maximization problems) to the primal solution.
  • Strong duality holds under certain conditions (e.g., Slater’s condition in convex problems), meaning the primal and dual solutions are equal.

Forming the Dual Problem¶

We construct the Lagrangian function: $$ L(x, \lambda, \nu) = f(x) + \sum_{i=1}^{m} \lambda_i g_i(x) + \sum_{j=1}^{p} \nu_j h_j(x) $$ where:

  • $\lambda_i \geq 0$ are dual variables (Lagrange multipliers) for inequality constraints.
  • $\nu_j$ are dual variables for equality constraints.

The dual function is: $$ \theta(\lambda, \nu) = \inf_x L(x, \lambda, \nu) $$ Then, the dual optimization problem is: $$ \max_{\lambda \geq 0, \nu} \theta(\lambda, \nu) $$
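As a quick worked illustration (a toy example added here, not from the slides): for $ \min_x x^2 $ subject to $ 1 - x \leq 0 $, the Lagrangian is $ L(x, \lambda) = x^2 + \lambda (1 - x) $. Minimizing over $x$ gives $ x = \lambda / 2 $, so $$ \theta(\lambda) = \frac{\lambda^2}{4} + \lambda\left(1 - \frac{\lambda}{2}\right) = \lambda - \frac{\lambda^2}{4}. $$ Maximizing over $ \lambda \geq 0 $ gives $ \lambda^* = 2 $ and $ \theta(\lambda^*) = 1 $, matching the primal optimum $ x^* = 1 $, $ f(x^*) = 1 $: strong duality holds here.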

We don't need to know about dual optimization in this course

Returning back to our optimization problem¶

The problem more generally: $$ \min_{x} \quad f(x) $$ $$ \text{subject to } g_i(x) \leq 0, \quad i = 1, ..., m $$

Solution: The solution $x^*$ has the smallest value $f(x^*)$ among all points that satisfy the constraints.

Questions:

  • When can we find the optima?
  • What algorithms are there to find the optima?
  • How long does it take to find the optima?

Convexity¶

Understanding the Convexity Inequality

A convex function is a function where the line segment between any two points on its graph lies on or above the graph. This property ensures that the function has no dips or valleys below any such chord.

Mathematical Definition

A function $ f: \mathbb{R}^n \to \mathbb{R} $ is convex if, for all points $ x, y \in \mathbb{R}^n $ and for all $ \lambda \in [0,1] $, the following inequality holds:

$$ f(\lambda x + (1-\lambda) y) \leq \lambda f(x) + (1-\lambda) f(y) $$

This condition means that for any two points $ x $ and $ y $, the function evaluated at any convex combination of these points is less than or equal to the corresponding convex combination of function values.
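A quick numerical sanity check of this inequality (a snippet added here; sampling points is evidence, not a proof):

In [ ]:
import numpy as np

# Spot-check the convexity inequality for f(x) = x^2 at random points.
f = lambda x: x ** 2

rng = np.random.default_rng(0)
for _ in range(10_000):
    x, y = rng.normal(size=2) * 10
    lam = rng.uniform()
    # f(lam*x + (1-lam)*y) <= lam*f(x) + (1-lam)*f(y), up to float tolerance
    assert f(lam * x + (1 - lam) * y) <= lam * f(x) + (1 - lam) * f(y) + 1e-9

print("Convexity inequality held at all sampled points.")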

Examples of Convex Functions

  • Exponential: $ \exp(a x) $ is convex on $ x \in \mathbb{R} $ for all $ a \in \mathbb{R} $.
  • Negative Logarithm: $ -\log(x) $ is convex on $ x \in \mathbb{R}_{++} $.
  • Negative Entropy: $ -H(x) = x \log(x) $ is convex on $ x \in \mathbb{R}_{++} $.
  • Norms: $ \| w \|_p $ for $ p \geq 1 $.
  • Log-Sum-Exp: $ \log(\exp(w_1) + \dots + \exp(w_d)) $.

Operations Which Preserve Convexity

  • Non-negative weighted sums: $ \alpha_i \geq 0 $; if $ f_i $ are convex for all $ i $, then so is:

    $$ g = \alpha_1 f_1 + \alpha_2 f_2 + \dots $$

  • Composition with an affine mapping: If $ f $ is convex, then so is:

    $$ g(\mathbf{w}) = f(A\mathbf{w} + b) $$

  • Pointwise maximum: If $ f_1, f_2 $ are convex, then so is:

    $$ g(\mathbf{w}) = \max \{ f_1(\mathbf{w}), f_2(\mathbf{w}) \} $$
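The pointwise-maximum rule can be spot-checked the same way (again sampling, not a proof):

In [ ]:
import numpy as np

# g(w) = max(f1(w), f2(w)) with f1, f2 convex should itself be convex.
f1 = lambda w: (w - 1) ** 2
f2 = lambda w: np.exp(w)
g = lambda w: max(f1(w), f2(w))

rng = np.random.default_rng(1)
for _ in range(10_000):
    x, y = rng.normal(size=2) * 3
    lam = rng.uniform()
    assert g(lam * x + (1 - lam) * y) <= lam * g(x) + (1 - lam) * g(y) + 1e-9

print("Pointwise maximum stayed convex at all sampled points.")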

Why is convexity important?

  • Global Optimality Guarantee: If a function is convex, any local minimum is also a global minimum. This is a crucial property because, in general, non-convex functions may have multiple local minima, making it difficult to find the best solution. Formally, for a differentiable convex function $ f(x) $, if $ x^* $ satisfies:

    $$ \nabla f(x^*) = 0 $$

    then $ x^* $ is a global minimum.

  • Efficient Algorithms for Convex Problems: Convex optimization problems can often be solved efficiently using well-established algorithms, including:

    • Gradient Descent (for smooth convex functions)
    • Newton’s Method (for strongly convex functions)
    • Interior-Point Methods (for constrained convex optimization)
    • Simplex and Interior-Point Methods (for linear programming problems)

      These algorithms are guaranteed to converge to the global minimum under appropriate conditions.

How do we find the minima?¶

Iterative Algorithm

  • Start with some guess $ \mathbf{w} $
  • Iterate $ k = 1, 2, 3, \dots $
    • Select direction $ d_k $ and step size $ \alpha_k $
    • Update:
      $$ \mathbf{w} \gets \mathbf{w} + \alpha_k d_k $$
    • Check whether we should stop (e.g., if $ \nabla f(\mathbf{w}) \approx 0 $)

Descent Direction

The descent direction $ d_k $ satisfies:

$$ \nabla f(\mathbf{w})^\top d_k < 0 $$

How to select direction $d_k$

  • Steepest Descent: $d_k = -\nabla f(w_k)$

Other directions also used:

  • Scaled gradient $d_k = -D_k \nabla f(w_k)$ where $D_k$ is a scaling matrix (also called preconditioner)
  • etc.

How to Select Stepsize

  • Exact:
    $$ \alpha_k = \arg \min_{\alpha \geq 0} f(\mathbf{w}_k + \alpha d_k) $$

  • Constant:
    $$ \alpha_k = \alpha \quad \text{for a fixed constant } \alpha > 0 $$

Steepest descent + constant step size is gradient descent!
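A minimal sketch of this recipe in code (a made-up 1-D example with a constant step):

In [ ]:
# Gradient descent on f(w) = (w - 3)^2, whose minimum is at w = 3.
f_grad = lambda w: 2 * (w - 3)   # gradient of f

w = 0.0          # initial guess
alpha = 0.1      # constant step size
for k in range(100):
    d = -f_grad(w)               # steepest descent direction
    w = w + alpha * d            # update w <- w + alpha * d
    if abs(f_grad(w)) < 1e-8:    # stop once the gradient is ~0
        break

print(w)  # -> approximately 3.0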

So what's the problem? Can we call the lecture over? Interactive web tool

How to Select Stepsize (continued)

Optimal line search: At every iteration $k$, optimal line search computes the step size $ \alpha_k $ as follows:

$$ \alpha_k = \arg \min_{\alpha \geq 0} f(\mathbf{w}_k + \alpha d_k). $$

Clearly, the above problem presumes a chosen direction of motion $ d_k $. The method searches for the value of $ \alpha_k $ over the half-line $ [0, \infty) $, hence the name optimal line search.

By this description, it is clear that optimal line search corresponds to a sequence of optimal local decisions, yielding the maximum reduction of $ f $ at every iteration $ k $, given that the current position is $ \mathbf{w}_k $ and the descent direction has been chosen to be $ d_k $.

Solving for $ \alpha_k $ exactly at every iteration of the gradient or steepest descent algorithms may be difficult and costly.
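For the special case of a quadratic objective the exact step does have a closed form; a minimal sketch (notation introduced here, assuming $ f(\mathbf{w}) = \frac{1}{2}\mathbf{w}^\top A \mathbf{w} - \mathbf{b}^\top \mathbf{w} $ with $A$ positive definite):

In [ ]:
import numpy as np

# Exact line search for the quadratic f(w) = 0.5 * w^T A w - b^T w.
A = np.array([[2.0, 0.0],
              [0.0, 10.0]])
b = np.array([1.0, 1.0])
w = np.array([5.0, 5.0])

g = A @ w - b                    # gradient at w
d = -g                           # steepest descent direction
alpha = -(g @ d) / (d @ A @ d)   # argmin over alpha of f(w + alpha * d)
print(alpha, w + alpha * d)

For general $f$, no such formula is available.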

This motivates the Armijo rule. [1]

Armijo’s Rule in Machine Learning

Armijo’s rule is a step size selection method used in gradient-based optimization to ensure sufficient decrease in the objective function while maintaining stability. It is widely used in line search methods for optimization, particularly in gradient descent, Newton’s method, and quasi-Newton methods.


Armijo’s Rule for Step Size Selection: Armijo’s rule uses a backtracking approach to find a suitable step size $ \alpha $:

  1. Start with an initial step size $ \alpha = s $.
  2. Check if the Armijo condition holds:

    $$ f(\mathbf{w}_k + \alpha d_k) - f(\mathbf{w}_k) \leq \sigma \alpha \nabla f(\mathbf{w}_k)^T d_k $$

  3. If the condition does not hold, reduce $ \alpha $ using a decay factor $ \beta $:

    $$ \alpha = \beta \alpha, \quad \text{(e.g., with } \beta = 0.5) $$

  4. Repeat until a suitable $ \alpha $ is found.

Armijo’s Condition: The Armijo condition ensures that the function value decreases sufficiently after taking a step in the descent direction. Given an objective function $ f(\mathbf{w}) $, the step size $ \alpha $ must satisfy:

$$ f(\mathbf{w}_k + \alpha d_k) - f(\mathbf{w}_k) \leq \sigma \alpha \nabla f(\mathbf{w}_k)^T d_k $$

where:

  • $ \mathbf{w}_k $ is the current iterate.
  • $ d_k $ is the descent direction (e.g., $ d_k = -\nabla f(\mathbf{w}_k) $ in gradient descent).
  • $ \alpha $ is the step size.
  • $ \sigma \in (0,1) $ is a parameter that controls how much decrease is required (typically, $ \sigma \approx 0.1 $).

This condition:

  • Ensures sufficient decrease: The left-hand side, $ f(\mathbf{w}_k + \alpha d_k) - f(\mathbf{w}_k) $, represents the actual change in the function value after taking a step.
  • Bounds the decrease: The right-hand side, $ \sigma \alpha \nabla f(\mathbf{w}_k)^T d_k $, represents a fraction of the expected decrease based on the first-order Taylor approximation:

    $$ f(\mathbf{w}_k + \alpha d_k) \approx f(\mathbf{w}_k) + \alpha \nabla f(\mathbf{w}_k)^T d_k $$

    The condition ensures that the actual decrease is at least a fraction $ \sigma $ of the predicted decrease.
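A minimal backtracking implementation of the rule (a sketch added here, with $ s = 1 $, $ \beta = 0.5 $, $ \sigma = 0.1 $):

In [ ]:
import numpy as np

def armijo_step(f, grad_f, w, d, s=1.0, beta=0.5, sigma=0.1):
    """Backtrack from alpha = s until the Armijo condition holds."""
    alpha = s
    fw = f(w)
    slope = grad_f(w) @ d   # directional derivative; negative for a descent direction
    while f(w + alpha * d) - fw > sigma * alpha * slope:
        alpha *= beta       # condition failed: shrink the step and retry
    return alpha

# Toy usage on the same function as the demo below.
f = lambda w: (w[0] - 20) ** 2 / 20 + 2 * w[1] ** 2
grad_f = lambda w: np.array([2 * (w[0] - 20) / 20, 4 * w[1]])

w = np.array([-5.0, -5.0])
d = -grad_f(w)
print(armijo_step(f, grad_f, w, d))  # -> 0.25 for this starting point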

Using a smart step size helps stabilize gradient descent (and makes things faster too!) Interactive web tool

Momentum in gradient descent¶

One problem with gradient descent is that it has no memory.

  • If the gradient is pointing left, it'll go left.
  • Next iteration if the gradient is pointing right, it goes right.
  • This issue is especially bad when the gradient magnitudes are uneven across dimensions, so that the level sets of the loss form elongated ellipses.

The ratio between the lengths of the longest and shortest principal axes of such an ellipse determines the condition number:

$$\kappa = \left(\frac{a}{b}\right)^2$$

In order to get within $\epsilon$ of a minimum, we need at least $\kappa \log\left(\frac{1}{\epsilon}\right)$ iterations (see Gradient Descent Condition Number Dependence).

In [34]:
import torch
import numpy as np
import matplotlib.pyplot as plt

# Define the loss function f(x1, x2) = (x1 - 20)^2 / 20 + 2 * x2^2
def loss_function(x):
    return (x[0] - 20) ** 2 / 20 + 2 * x[1] ** 2

# Gradient of the loss function
def compute_gradient(x):
    grad_x1 = 2 * (x[0] - 20) / 20
    grad_x2 = 2 * 2 * x[1]
    return torch.tensor([grad_x1, grad_x2])

# Optimization settings
learning_rate = 0.48
iterations = 50

# Initial point
x_gd = torch.tensor([-5.0, -5.0])

# Store trajectory for plotting
gd_trajectory = [x_gd.numpy()]

# Perform optimization
for _ in range(iterations):
    grad_gd = compute_gradient(x_gd)
    x_gd = x_gd - learning_rate * grad_gd
    gd_trajectory.append(x_gd.numpy())

# Convert trajectory to NumPy for plotting
gd_trajectory = np.array(gd_trajectory)

# Create meshgrid for contour plot
x1_vals = np.linspace(-10, 40, 100)
x2_vals = np.linspace(-10, 10, 100)
X1, X2 = np.meshgrid(x1_vals, x2_vals)
Z = loss_function([X1, X2])

# Create figure and contour plot
fig, ax = plt.subplots(figsize=(8, 6))
contour = ax.contour(X1, X2, Z, levels=30, cmap='viridis')
ax.clabel(contour, inline=True, fontsize=8)

# Plot Gradient Descent trajectory
ax.plot(gd_trajectory[:, 0], gd_trajectory[:, 1], 'r-o', markersize=4, label="Gradient Descent")

# Labels and title
ax.set_xlabel('$x_1$')
ax.set_ylabel('$x_2$')
ax.set_title('Gradient Descent Optimization Path')
#ax.legend()

# Save the plot as an image file
image_filename = "./img/gradient_descent.png"
plt.savefig(image_filename, dpi=300, bbox_inches='tight')

# Show the plot
plt.show()

Gradient Descent with Momentum is an optimization algorithm that improves standard gradient descent by accelerating convergence and reducing oscillations, especially in highly curved loss landscapes.


Update Rule for Gradient Descent with Momentum: In standard gradient descent, the update rule is:

$$ \mathbf{w}_{k+1} = \mathbf{w}_k - \alpha \nabla f(\mathbf{w}_k) $$

In gradient descent with momentum, we introduce a velocity term $ v_k $:

  1. Compute the velocity (momentum update): $$ v_{k+1} = \beta v_k - \alpha \nabla f(\mathbf{w}_k) $$

  2. Update weights using velocity: $$ \mathbf{w}_{k+1} = \mathbf{w}_k + v_{k+1} $$

where:

  • $ \alpha $ is the learning rate.
  • $ \beta \in [0,1] $ is the momentum coefficient, typically 0.9.
  • $ v_k $ is the velocity term (tracks accumulated gradients).
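The two updates in code (a minimal 1-D sketch; the full comparison demo appears below):

In [ ]:
# Gradient descent with momentum on f(w) = w^2 (toy example).
grad_f = lambda w: 2 * w

w, v = 5.0, 0.0         # initial point and velocity
alpha, beta = 0.1, 0.9  # learning rate and momentum coefficient

for k in range(200):
    v = beta * v - alpha * grad_f(w)   # velocity (momentum) update
    w = w + v                          # weight update uses the velocity

print(w)  # -> approximately 0.0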

Intuition Behind Momentum

  • Think of momentum as a ball rolling down a hill: it builds speed in a direction and resists small changes.
  • Instead of just using the current gradient, it accumulates past gradients to make updates smoother and faster.

Momentum Helps By:

  • Accumulating past gradients to smooth updates.
  • Accelerating convergence by moving in a consistent direction.
  • Reducing oscillations by dampening fluctuations in gradient updates.

Gradient descent with and without momentum ($\alpha=0.2$)¶

Gradient descent with and without momentum ($\alpha=0.5$)¶

In [26]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Define the loss function f(x1, x2) = (x1 - 20)^2 / 20 + 2 * x2^2
def loss_function(x):
    return (x[0] - 20) ** 2 / 20 + 2 * x[1] ** 2

# Gradient of the loss function
def compute_gradient(x):
    grad_x1 = 2 * (x[0] - 20) / 20
    grad_x2 = 2 * 2 * x[1]
    return torch.tensor([grad_x1, grad_x2])

# Optimization settings
learning_rate = 0.2
momentum = 0.9
iterations = 50

# Initial point (starting from a far-off place)
x_gd = torch.tensor([-5.0, -5.0])
x_gdm = torch.tensor([-5.0, -5.0])

# Lists to store trajectory for plotting
gd_trajectory = [x_gd.detach().numpy()]
gdm_trajectory = [x_gdm.detach().numpy()]

# Momentum velocity term
velocity = torch.tensor([0.0, 0.0])

# Perform optimization
for _ in range(iterations):
    # Compute gradients
    grad_gd = compute_gradient(x_gd)
    grad_gdm = compute_gradient(x_gdm)

    # Normal Gradient Descent Update
    x_gd = x_gd - learning_rate * grad_gd
    gd_trajectory.append(x_gd.numpy())

    # Gradient Descent with Momentum Update
    velocity = momentum * velocity - learning_rate * grad_gdm
    x_gdm = x_gdm + velocity
    gdm_trajectory.append(x_gdm.numpy())
    
    
# Convert trajectories to NumPy for plotting
gd_trajectory = np.array(gd_trajectory)
gdm_trajectory = np.array(gdm_trajectory)

# Create a meshgrid for 3D surface plot
x1_vals = np.linspace(-10, 40, 100)
x2_vals = np.linspace(-10, 10, 100)
X1, X2 = np.meshgrid(x1_vals, x2_vals)
Z = loss_function([X1,X2])

# Plot the function surface and optimization paths
fig = plt.figure(figsize=(12, 8))

# 3D surface plot
ax = fig.add_subplot(111, projection='3d')
ax.plot_surface(X1, X2, Z, cmap='viridis', alpha=0.7)

# Plot GD trajectory
ax.plot(gd_trajectory[:, 0], gd_trajectory[:, 1], color='red', marker='o', markersize=4, label='Gradient Descent')

# Plot GDM trajectory
ax.plot(gdm_trajectory[:, 0], gdm_trajectory[:, 1], color='blue', marker='o', markersize=4, label='Momentum Gradient Descent')

# Labels and legend
ax.set_xlabel('$x_1$')
ax.set_ylabel('$x_2$')
ax.set_zlabel('Loss')
ax.set_title('Gradient Descent vs Momentum Gradient Descent')
ax.legend()

# 2D Contour plot projection
ax.contour(X1, X2, Z, levels=20, cmap='gray', zdir='z', offset=np.min(Z))

plt.show()
In [29]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.animation as animation

# Define the loss function f(x1, x2) = (x1 - 20)^2 / 20 + 2 * x2^2
def loss_function(x):
    return (x[0] - 20) ** 2 / 20 + 2 * x[1] ** 2

# Gradient of the loss function
def compute_gradient(x):
    grad_x1 = 2 * (x[0] - 20) / 20
    grad_x2 = 2 * 2 * x[1]
    return torch.tensor([grad_x1, grad_x2])

# Optimization settings
learning_rate = 0.5
momentum = 0.9
iterations = 50

# Initial points
x_gd = torch.tensor([-5.0, -5.0])
x_gdm = torch.tensor([-5.0, -5.0])

# Lists to store trajectory for plotting
gd_trajectory = [x_gd.numpy()]
gdm_trajectory = [x_gdm.numpy()]

# Momentum velocity term
velocity = torch.tensor([0.0, 0.0])

# Perform optimization
for _ in range(iterations):
    grad_gd = compute_gradient(x_gd)
    grad_gdm = compute_gradient(x_gdm)

    # Normal Gradient Descent Update
    x_gd = x_gd - learning_rate * grad_gd
    gd_trajectory.append(x_gd.numpy())

    # Gradient Descent with Momentum Update
    velocity = momentum * velocity - learning_rate * grad_gdm
    x_gdm = x_gdm + velocity
    gdm_trajectory.append(x_gdm.numpy())

# Convert trajectories to NumPy for plotting
gd_trajectory = np.array(gd_trajectory)
gdm_trajectory = np.array(gdm_trajectory)

# Create meshgrid for surface and contour plots
x1_vals = np.linspace(-10, 40, 100)
x2_vals = np.linspace(-10, 10, 100)
X1, X2 = np.meshgrid(x1_vals, x2_vals)
Z = loss_function([X1, X2])

# Create figure and subplots
fig = plt.figure(figsize=(12, 8))

# 3D surface plot
ax3d = fig.add_subplot(121, projection='3d')
ax3d.plot_surface(X1, X2, Z, cmap='viridis', alpha=0.7)

# 2D contour plot
ax2d = fig.add_subplot(122)
contour = ax2d.contour(X1, X2, Z, levels=30, cmap='viridis')
ax2d.clabel(contour, inline=True, fontsize=8)

# Initialize points for animation
gd_point_3d, = ax3d.plot([], [], [], 'ro', markersize=8, label="Gradient Descent")
gdm_point_3d, = ax3d.plot([], [], [], 'bo', markersize=8, label="Momentum GD")

gd_point_2d, = ax2d.plot([], [], 'ro', markersize=8)
gdm_point_2d, = ax2d.plot([], [], 'bo', markersize=8)

# Plot initial trajectories
gd_traj_2d, = ax2d.plot([], [], 'r-', alpha=0.5)
gdm_traj_2d, = ax2d.plot([], [], 'b-', alpha=0.5)

# Labels and legends
ax3d.set_xlabel('$x_1$')
ax3d.set_ylabel('$x_2$')
ax3d.set_zlabel('Loss')
ax3d.set_title('Gradient Descent vs Momentum (3D Surface)')
ax3d.legend()

ax2d.set_xlabel('$x_1$')
ax2d.set_ylabel('$x_2$')
ax2d.set_title('Optimization Path on Contour')

def update(frame):
    if frame < len(gd_trajectory):
        # Update ball positions (must pass lists or arrays, not scalars)
        gd_point_3d.set_data([gd_trajectory[frame, 0]], [gd_trajectory[frame, 1]])
        gd_point_3d.set_3d_properties([loss_function(gd_trajectory[frame])])

        gdm_point_3d.set_data([gdm_trajectory[frame, 0]], [gdm_trajectory[frame, 1]])
        gdm_point_3d.set_3d_properties([loss_function(gdm_trajectory[frame])])

        gd_point_2d.set_data([gd_trajectory[frame, 0]], [gd_trajectory[frame, 1]])
        gdm_point_2d.set_data([gdm_trajectory[frame, 0]], [gdm_trajectory[frame, 1]])

        # Update trajectory lines on contour plot
        gd_traj_2d.set_data(gd_trajectory[:frame+1, 0], gd_trajectory[:frame+1, 1])
        gdm_traj_2d.set_data(gdm_trajectory[:frame+1, 0], gdm_trajectory[:frame+1, 1])

    return gd_point_3d, gdm_point_3d, gd_point_2d, gdm_point_2d, gd_traj_2d, gdm_traj_2d

# Create and save animation
ani = animation.FuncAnimation(fig, update, frames=iterations, interval=150, blit=True)

# Save the animation as a GIF file
gif_filename = "./img/gradient_vs_momentum.gif"
ani.save(gif_filename, writer=animation.PillowWriter(fps=10))

print(f"Animation saved as {gif_filename}")
Animation saved as ./img/gradient_vs_momentum.gif

That's it for today¶

We'll discuss linear regression next time.

Other References:¶

[1] Dimitrios Katselis "Lecture 3: Steepest and Gradient Descent-Part I" link

[2] Lewis Mitchell, "Gradient descent: steepest descent" video

[3] Visually Explained, "Accelerate Gradient Descent with Momentum" video
