The Quick Version

Neural style transfer takes the content of one image (your photo) and renders it in the style of another (a painting). It works by optimizing a new image to match the content features of your photo and the texture/style features of the artwork, both extracted from a pretrained CNN.

1
pip install torch torchvision pillow
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms, models
from PIL import Image

# Use the GPU when available; CPU works but the optimization loop is much slower.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def load_image(path: str, size: int = 512) -> torch.Tensor:
    """Open an image file and return it as a (1, 3, size, size) float
    tensor in [0, 1] on `device`.

    Note: Resize((size, size)) forces a square output, so aspect ratio
    is not preserved — content and style maps then align spatially.
    """
    pil_img = Image.open(path).convert("RGB")
    to_tensor = transforms.Compose([
        transforms.Resize((size, size)),
        transforms.ToTensor(),
    ])
    return to_tensor(pil_img).unsqueeze(0).to(device)

def save_image(tensor: torch.Tensor, path: str):
    """Write a (1, C, H, W) tensor to disk as an image, clamping to [0, 1]."""
    img = tensor.squeeze(0).cpu().clamp(0, 1)
    pil_img = transforms.ToPILImage()(img)
    pil_img.save(path)

# Load content and style images
# Both come back as (1, 3, 512, 512) tensors on `device`.
content_img = load_image("photo.jpg")
style_img = load_image("starry_night.jpg")

print(f"Content: {content_img.shape}, Style: {style_img.shape}")

The Style Transfer Algorithm

The core idea: extract features from intermediate layers of VGG-19. Early layers capture textures and colors (style), while deeper layers capture shapes and objects (content). Optimize a generated image to match both.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# Load pretrained VGG-19 features
# .features is the convolutional stack only (no classifier head);
# .eval() puts it in inference mode.
vgg = models.vgg19(weights=models.VGG19_Weights.IMAGENET1K_V1).features.to(device).eval()

# Freeze VGG weights — we're optimizing the image, not the network
for param in vgg.parameters():
    param.requires_grad_(False)

# Normalization to match VGG training
# These are the standard ImageNet per-channel mean/std the weights expect.
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)

def get_features(image: torch.Tensor, model: nn.Module) -> dict:
    """Extract activations from selected layers of a VGG-19 feature stack.

    Args:
        image: (1, 3, H, W) tensor in [0, 1] (un-normalized; normalized here).
        model: the VGG-19 ``features`` Sequential.

    Returns:
        Dict mapping readable layer names (e.g. "conv4_2") to activation
        tensors; conv4_2 serves as the content layer, the conv*_1 layers
        as style layers.
    """
    # Sequential child indices of the layers we sample, mapped to names.
    layers = {
        "0": "conv1_1",
        "5": "conv2_1",
        "10": "conv3_1",
        "19": "conv4_1",
        "21": "conv4_2",  # content layer
        "28": "conv5_1",
    }
    features = {}
    # Apply the ImageNet normalization the pretrained weights expect.
    x = normalize(image.squeeze(0)).unsqueeze(0)

    # named_children() is the public API for iterating submodules; for an
    # nn.Sequential the child names are the same string indices that the
    # private ._modules mapping exposed.
    for name, layer in model.named_children():
        x = layer(x)
        if name in layers:
            features[layers[name]] = x
    return features

def gram_matrix(tensor: torch.Tensor) -> torch.Tensor:
    """Return the normalized Gram matrix of a (B, C, H, W) feature map.

    Each channel is flattened to a row vector; the Gram matrix holds the
    inner products of every channel pair, divided by the total element
    count so values are comparable across feature maps of different sizes.
    """
    batch, channels, height, width = tensor.shape
    flat = tensor.reshape(batch * channels, height * width)
    correlations = flat @ flat.t()
    return correlations / (channels * height * width)

Running the Optimization

The generated image starts as a copy of the content image and gets iteratively modified to match the style:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
def style_transfer(
    content_img: torch.Tensor,
    style_img: torch.Tensor,
    steps: int = 300,
    style_weight: float = 1e6,
    content_weight: float = 1.0,
) -> torch.Tensor:
    """Run neural style transfer optimization.

    Args:
        content_img: (1, 3, H, W) content image in [0, 1].
        style_img: (1, 3, H, W) style image in [0, 1].
        steps: number of outer LBFGS steps (one closure evaluation each).
        style_weight: multiplier on the style (Gram) loss.
        content_weight: multiplier on the content (feature) loss.

    Returns:
        The optimized image tensor, clamped to [0, 1].
    """
    # Single source of truth for which layers define "style" and how much
    # each contributes (earlier layers = finer textures, weighted higher).
    # Previously this list was duplicated in three places.
    style_layer_weights = {
        "conv1_1": 1.0,
        "conv2_1": 0.8,
        "conv3_1": 0.5,
        "conv4_1": 0.3,
        "conv5_1": 0.1,
    }

    # Extract fixed targets once — VGG is frozen, so these never change.
    content_features = get_features(content_img, vgg)
    style_features = get_features(style_img, vgg)

    # Precompute the style targets as Gram matrices.
    style_grams = {
        layer: gram_matrix(style_features[layer])
        for layer in style_layer_weights
    }

    # Start from content image (converges faster than random noise)
    generated = content_img.clone().requires_grad_(True)
    # max_iter=1 makes each optimizer.step() evaluate the closure exactly
    # once, so `steps` outer iterations = `steps` loss evaluations.
    optimizer = optim.LBFGS([generated], max_iter=1)

    for step in range(steps):
        def closure():
            # Keep pixels in the valid range before each evaluation.
            generated.data.clamp_(0, 1)
            optimizer.zero_grad()
            gen_features = get_features(generated, vgg)

            # Content loss — match content layer features
            content_loss = nn.functional.mse_loss(
                gen_features["conv4_2"], content_features["conv4_2"]
            )

            # Style loss — match Gram matrices across style layers
            style_loss = 0
            for layer, weight in style_layer_weights.items():
                gen_gram = gram_matrix(gen_features[layer])
                style_loss += weight * nn.functional.mse_loss(
                    gen_gram, style_grams[layer]
                )

            total_loss = content_weight * content_loss + style_weight * style_loss
            total_loss.backward()
            return total_loss

        optimizer.step(closure)

        # Periodic progress report on the content term only.
        if (step + 1) % 50 == 0:
            with torch.no_grad():
                gen_features = get_features(generated, vgg)
                c_loss = nn.functional.mse_loss(
                    gen_features["conv4_2"], content_features["conv4_2"]
                )
                print(f"Step {step+1}/{steps}, Content loss: {c_loss.item():.4f}")

    generated.data.clamp_(0, 1)
    return generated

# Full run with the default weights (300 LBFGS evaluations).
result = style_transfer(content_img, style_img, steps=300)
save_image(result, "styled_output.png")
print("Saved styled_output.png")

Controlling Style Strength

The style_weight parameter controls how strongly the style is applied. Higher values give more artistic effect but can obscure the original content.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
# Each call below re-runs the full 300-step optimization from scratch.
# Subtle style — photo is clearly recognizable
result_subtle = style_transfer(content_img, style_img, style_weight=1e4)
save_image(result_subtle, "subtle.png")

# Medium style — balanced
result_medium = style_transfer(content_img, style_img, style_weight=1e6)
save_image(result_medium, "medium.png")

# Heavy style — painting dominates
result_heavy = style_transfer(content_img, style_img, style_weight=1e8)
save_image(result_heavy, "heavy.png")
| Style Weight | Effect                 | Best For                         |
|--------------|------------------------|----------------------------------|
| 1e3 - 1e4    | Subtle texture overlay | Photo filters, light effects     |
| 1e5 - 1e6    | Balanced transfer      | General artistic rendering       |
| 1e7 - 1e8    | Heavy stylization      | Abstract art, creative projects  |

Fast Style Transfer with a Trained Network

The optimization approach above takes 30-60 seconds per image. For real-time applications, use a feed-forward network that’s trained once and then applies style instantly:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
from torchvision.models import vgg16
import torch.nn as nn

class TransformerNet(nn.Module):
    """Feed-forward style transfer network (Johnson et al.)"""

    def __init__(self):
        super().__init__()
        layers = [
            # Downsampling: one stride-1 9x9 conv, then two stride-2 convs.
            nn.ReflectionPad2d(4),
            nn.Conv2d(3, 32, 9, stride=1),
            nn.InstanceNorm2d(32),
            nn.ReLU(),
            nn.Conv2d(32, 64, 3, stride=2, padding=1),
            nn.InstanceNorm2d(64),
            nn.ReLU(),
            nn.Conv2d(64, 128, 3, stride=2, padding=1),
            nn.InstanceNorm2d(128),
            nn.ReLU(),
        ]
        # Residual blocks (5 blocks for quality)
        layers.extend(ResidualBlock(128) for _ in range(5))
        layers += [
            # Upsampling back to the input resolution.
            nn.ConvTranspose2d(128, 64, 3, stride=2, padding=1, output_padding=1),
            nn.InstanceNorm2d(64),
            nn.ReLU(),
            nn.ConvTranspose2d(64, 32, 3, stride=2, padding=1, output_padding=1),
            nn.InstanceNorm2d(32),
            nn.ReLU(),
            nn.ReflectionPad2d(4),
            nn.Conv2d(32, 3, 9, stride=1),
            # Sigmoid keeps the output image in [0, 1].
            nn.Sigmoid(),
        ]
        # Same submodule order as before, so state_dict keys are unchanged.
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x)

class ResidualBlock(nn.Module):
    """Two 3x3 convs with instance norm and an identity shortcut:
    the input is added back to the conv output, so the block only has
    to learn a residual correction."""

    def __init__(self, channels: int):
        super().__init__()
        # ReflectionPad2d(1) + 3x3 conv keeps spatial size unchanged
        # without the border artifacts of zero padding.
        self.block = nn.Sequential(
            nn.ReflectionPad2d(1),
            nn.Conv2d(channels, channels, 3),
            nn.InstanceNorm2d(channels),
            nn.ReLU(),
            nn.ReflectionPad2d(1),
            nn.Conv2d(channels, channels, 3),
            nn.InstanceNorm2d(channels),
        )

    def forward(self, x):
        residual = self.block(x)
        return x + residual

# After training, inference is instant:
# styled = transfer_net(content_image)  # ~20ms on GPU

Train this network once per style (takes 2-4 hours on a single GPU with the COCO dataset), then apply it to any image in milliseconds. This is how mobile apps like Prisma work.

Common Errors and Fixes

Output looks like a blurry mess

style_weight is too high relative to content_weight. Start with style_weight=1e5 and increase gradually. Also check that both images are the same resolution — mismatched sizes cause feature alignment issues.

CUDA out of memory

Reduce image size from 512 to 256. Style transfer's memory usage grows quadratically with the image's side length because every VGG feature map (kept in memory for backpropagation) scales with height × width — the Gram matrices themselves are only channels × channels and are comparatively small. For high-res output, generate at 256 and upscale with a super-resolution model.

Style doesn’t transfer evenly across the image

Some image regions have weak features that don’t constrain the style well. Add total variation loss to smooth the output:

1
2
3
4
def total_variation_loss(img: torch.Tensor) -> torch.Tensor:
    """Mean squared difference between adjacent pixels, vertically plus
    horizontally — penalizes high-frequency noise to smooth the output."""
    vertical = img[:, :, 1:, :] - img[:, :, :-1, :]
    horizontal = img[:, :, :, 1:] - img[:, :, :, :-1]
    return vertical.pow(2).mean() + horizontal.pow(2).mean()

Add tv_weight * total_variation_loss(generated) to your total loss. Use tv_weight=1e-4 to start.

Optimization doesn’t converge

Switch from LBFGS to Adam with lr=0.01. Adam is slower to converge but more stable. Also ensure your images are normalized to [0, 1] range before optimization.