The Quick Version

AI super resolution upscales images by 2-4x while adding sharp detail that traditional interpolation (bicubic, Lanczos) can’t. Real-ESRGAN is the go-to model — it handles photos, illustrations, and compressed images with minimal artifacts.

1
pip install realesrgan torch torchvision pillow
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
from realesrgan import RealESRGANer
from basicsr.archs.rrdbnet_arch import RRDBNet
from PIL import Image
import numpy as np
import torch

# Load the Real-ESRGAN model (4x upscaling)
model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=4)
upsampler = RealESRGANer(
    scale=4,
    model_path="https://github.com/xinntao/Real-ESRGAN/releases/download/v0.1.0/RealESRGAN_x4plus.pth",
    model=model,
    tile=0,           # 0 for no tiling, set higher for low VRAM
    tile_pad=10,
    pre_pad=0,
    half=True,         # use fp16 for speed
)

# Upscale an image
img = Image.open("low_res_photo.jpg")
img_array = np.array(img)

output, _ = upsampler.enhance(img_array, outscale=4)
Image.fromarray(output).save("upscaled_4x.png")

print(f"Input:  {img.size[0]}x{img.size[1]}")
print(f"Output: {output.shape[1]}x{output.shape[0]}")
# Input:  256x256
# Output: 1024x1024

A 256x256 image becomes a sharp 1024x1024 image in about 1-2 seconds on a GPU. The model adds realistic texture and detail that wasn’t in the original.

Choosing the Right Model

Different models work better for different content types:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
from realesrgan import RealESRGANer
from basicsr.archs.rrdbnet_arch import RRDBNet

def load_upscaler(model_type: str = "general") -> RealESRGANer:
    """Load the best upscaler for your content type."""
    configs = {
        "general": {
            "model_path": "RealESRGAN_x4plus.pth",
            "scale": 4,
            "block": 23,
        },
        "anime": {
            "model_path": "RealESRGAN_x4plus_anime_6B.pth",
            "scale": 4,
            "block": 6,
        },
        "face": {
            "model_path": "GFPGANv1.4.pth",
            "scale": 4,
            "block": 23,
        },
    }

    config = configs[model_type]
    model = RRDBNet(
        num_in_ch=3, num_out_ch=3, num_feat=64,
        num_block=config["block"], num_grow_ch=32, scale=config["scale"],
    )

    return RealESRGANer(
        scale=config["scale"],
        model_path=config["model_path"],
        model=model,
        half=True,
    )

# General photos
upscaler = load_upscaler("general")

# Anime/illustrations — cleaner lines, less texture hallucination
upscaler = load_upscaler("anime")
ModelBest ForSpeedQuality
RealESRGAN_x4plusPhotos, general contentFastExcellent
RealESRGAN_x4plus_anime_6BAnime, illustrations, pixel artFasterExcellent for art
SwinIRMaximum quality, researchSlowBest
StableSRFaces, complex scenesSlowestMost detailed

SwinIR for Maximum Quality

SwinIR uses a Swin Transformer architecture and produces the highest quality results, but it’s slower than ESRGAN.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
from transformers import AutoImageProcessor, SwinForImageClassification
import torch
from PIL import Image
import torchvision.transforms as T

# Using HuggingFace's SwinIR implementation
from huggingface_hub import hf_hub_download
import importlib

# Download SwinIR model
model_path = hf_hub_download(
    repo_id="caidas/swinIR-M-real-sr-x4",
    filename="003_realSR_BSRGAN_DFO_s64w8_SwinIR-M_x4_GAN.pth"
)

# Alternative: use the swinir package directly
# pip install swinir
from PIL import Image
import numpy as np

def upscale_with_swinir(image_path: str, output_path: str, scale: int = 4):
    """Upscale using SwinIR (higher quality, slower)."""
    img = Image.open(image_path).convert("RGB")
    # SwinIR requires specific preprocessing
    img_tensor = T.ToTensor()(img).unsqueeze(0)

    # Pad to multiple of window size (8)
    _, _, h, w = img_tensor.shape
    pad_h = (8 - h % 8) % 8
    pad_w = (8 - w % 8) % 8
    img_tensor = torch.nn.functional.pad(img_tensor, (0, pad_w, 0, pad_h), mode="reflect")

    print(f"Processing {w}x{h} image with SwinIR...")
    # Model inference would go here
    # output = model(img_tensor)

    return output_path

Batch Processing

For processing entire directories of images:

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
import time

def batch_upscale(
    input_dir: str,
    output_dir: str,
    upscaler: RealESRGANer,
    scale: int = 4,
    max_workers: int = 1,  # GPU bound, 1 is usually optimal
) -> dict:
    """Upscale all images in a directory."""
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    image_extensions = {".jpg", ".jpeg", ".png", ".webp", ".bmp"}
    images = [f for f in input_path.iterdir() if f.suffix.lower() in image_extensions]

    stats = {"processed": 0, "failed": 0, "total_time": 0}

    for img_path in images:
        try:
            start = time.time()
            img = np.array(Image.open(img_path).convert("RGB"))
            output, _ = upscaler.enhance(img, outscale=scale)

            out_file = output_path / f"{img_path.stem}_upscaled.png"
            Image.fromarray(output).save(out_file)

            elapsed = time.time() - start
            stats["processed"] += 1
            stats["total_time"] += elapsed
            print(f"  {img_path.name}{out_file.name} ({elapsed:.1f}s)")
        except Exception as e:
            stats["failed"] += 1
            print(f"  FAILED {img_path.name}: {e}")

    return stats

stats = batch_upscale("./low_res/", "./high_res/", upscaler)
print(f"\nDone: {stats['processed']} processed, {stats['failed']} failed")
print(f"Average: {stats['total_time'] / max(stats['processed'], 1):.1f}s per image")

Tiling for Large Images and Low VRAM

Large images (2000x2000+) or GPUs with less than 6GB VRAM need tiling — process the image in overlapping patches and stitch them together.

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
def upscale_large_image(
    image_path: str,
    output_path: str,
    tile_size: int = 512,
    overlap: int = 32,
    scale: int = 4,
) -> None:
    """Upscale large images using tiling to avoid OOM."""
    # Configure upscaler with tiling
    model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=4)
    upscaler = RealESRGANer(
        scale=scale,
        model_path="RealESRGAN_x4plus.pth",
        model=model,
        tile=tile_size,
        tile_pad=overlap,
        pre_pad=0,
        half=True,
    )

    img = np.array(Image.open(image_path).convert("RGB"))
    print(f"Input: {img.shape[1]}x{img.shape[0]}, Tile size: {tile_size}")

    output, _ = upscaler.enhance(img, outscale=scale)
    Image.fromarray(output).save(output_path)
    print(f"Output: {output.shape[1]}x{output.shape[0]}")

# 4K input with tiling (won't OOM on 4GB VRAM)
upscale_large_image("large_photo.jpg", "large_upscaled.png", tile_size=256)

The tile_pad overlap ensures seamless stitching between tiles — without it, you get visible grid lines at tile boundaries.

Comparing Upscaling Methods

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
from PIL import Image
import numpy as np

def compare_methods(image_path: str, scale: int = 4):
    """Compare traditional vs AI upscaling."""
    img = Image.open(image_path).convert("RGB")
    target_size = (img.width * scale, img.height * scale)

    # Method 1: Nearest neighbor (pixelated)
    nearest = img.resize(target_size, Image.NEAREST)
    nearest.save("compare_nearest.png")

    # Method 2: Bicubic (smooth but blurry)
    bicubic = img.resize(target_size, Image.BICUBIC)
    bicubic.save("compare_bicubic.png")

    # Method 3: Lanczos (sharper than bicubic)
    lanczos = img.resize(target_size, Image.LANCZOS)
    lanczos.save("compare_lanczos.png")

    # Method 4: AI upscaling (sharp with added detail)
    img_array = np.array(img)
    output, _ = upsampler.enhance(img_array, outscale=scale)
    Image.fromarray(output).save("compare_ai.png")

    print(f"Saved 4 comparison images at {target_size[0]}x{target_size[1]}")

compare_methods("test_256.jpg")

At 4x upscale, the differences are dramatic. Bicubic produces a blurry mess. Lanczos is slightly better but still soft. AI upscaling produces sharp edges, realistic textures, and plausible detail that wasn’t in the original.

Common Errors and Fixes

CUDA out of memory on large images

Enable tiling: set tile=256 or tile=128 in the upscaler constructor. Smaller tiles use less VRAM but process slower. Also ensure half=True is set for fp16.

Output has visible tile boundaries

Increase tile_pad from 10 to 32 or 64. The padding creates overlap between tiles that gets blended during stitching.

Faces look distorted after upscaling

Use GFPGAN or CodeFormer for face-specific enhancement. Real-ESRGAN is great for general content but can produce uncanny faces. Process faces separately and composite them back:

1
2
3
4
# pip install gfpgan
from gfpgan import GFPGANer
face_enhancer = GFPGANer(model_path="GFPGANv1.4.pth", upscale=4)
_, _, output = face_enhancer.enhance(img_array, paste_back=True)

Artifacts on heavily compressed JPEG inputs

JPEG compression adds block artifacts that the upscaler can amplify. Pre-process with a JPEG artifact removal model, or use the RealESRGAN_x4plus model which was specifically trained on compressed inputs.

CPU-only processing is too slow

Without a GPU, expect 30-60 seconds per image. Use a smaller model (RealESRGAN_x4plus_anime_6B has fewer parameters) or reduce the output scale from 4x to 2x. For batch processing, consider a cloud GPU.