The Quick Version#
AI super resolution upscales images by 2-4x while adding sharp detail that traditional interpolation (bicubic, Lanczos) can’t. Real-ESRGAN is the go-to model — it handles photos, illustrations, and compressed images with minimal artifacts.
1
| pip install realesrgan torch torchvision pillow
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
from realesrgan import RealESRGANer
from basicsr.archs.rrdbnet_arch import RRDBNet
from PIL import Image
import numpy as np
import torch

# Load the Real-ESRGAN model (4x upscaling).
# RRDBNet hyperparameters here match the released RealESRGAN_x4plus weights.
model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=4)
upsampler = RealESRGANer(
    scale=4,
    model_path="https://github.com/xinntao/Real-ESRGAN/releases/download/v0.1.0/RealESRGAN_x4plus.pth",
    model=model,
    tile=0,  # 0 for no tiling, set higher for low VRAM
    tile_pad=10,
    pre_pad=0,
    # fp16 halves VRAM and speeds things up, but only works on CUDA —
    # forcing half=True on a CPU-only install errors out.
    half=torch.cuda.is_available(),
)

# Upscale an image. convert("RGB") normalizes RGBA/grayscale/palette inputs
# to the 3-channel layout the model expects.
img = Image.open("low_res_photo.jpg").convert("RGB")
img_array = np.array(img)
output, _ = upsampler.enhance(img_array, outscale=4)
Image.fromarray(output).save("upscaled_4x.png")
print(f"Input: {img.size[0]}x{img.size[1]}")
print(f"Output: {output.shape[1]}x{output.shape[0]}")
# Input: 256x256
# Output: 1024x1024
|
A 256x256 image becomes a sharp 1024x1024 image in about 1-2 seconds on a GPU. The model adds realistic texture and detail that wasn’t in the original.
Choosing the Right Model#
Different models work better for different content types:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
| from realesrgan import RealESRGANer
from basicsr.archs.rrdbnet_arch import RRDBNet
def load_upscaler(model_type: str = "general") -> "RealESRGANer":
    """Load the best upscaler for your content type.

    Args:
        model_type: One of "general", "anime", or "face".

    Returns:
        A configured RealESRGANer instance.

    Raises:
        ValueError: If ``model_type`` is not a known configuration.
    """
    configs = {
        "general": {
            "model_path": "RealESRGAN_x4plus.pth",
            "scale": 4,
            "block": 23,
        },
        "anime": {
            "model_path": "RealESRGAN_x4plus_anime_6B.pth",
            "scale": 4,
            "block": 6,
        },
        # NOTE(review): GFPGAN weights are a different architecture — loading
        # them into an RRDBNet will almost certainly fail. For faces, use
        # GFPGANer directly (see the "Faces look distorted" section below).
        "face": {
            "model_path": "GFPGANv1.4.pth",
            "scale": 4,
            "block": 23,
        },
    }
    if model_type not in configs:
        # A clear error beats the bare KeyError a dict lookup would raise.
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected one of {sorted(configs)}"
        )
    config = configs[model_type]
    model = RRDBNet(
        num_in_ch=3, num_out_ch=3, num_feat=64,
        num_block=config["block"], num_grow_ch=32, scale=config["scale"],
    )
    return RealESRGANer(
        scale=config["scale"],
        model_path=config["model_path"],
        model=model,
        half=True,
    )
# General photos
upscaler = load_upscaler("general")
# Anime/illustrations — cleaner lines, less texture hallucination
# NOTE(review): this rebinds `upscaler`, discarding the "general" instance
# loaded above — in real code, load only the model you actually need.
upscaler = load_upscaler("anime")
|
| Model | Best For | Speed | Quality |
|---|---|---|---|
| RealESRGAN_x4plus | Photos, general content | Fast | Excellent |
| RealESRGAN_x4plus_anime_6B | Anime, illustrations, pixel art | Faster | Excellent for art |
| SwinIR | Maximum quality, research | Slow | Best |
| StableSR | Faces, complex scenes | Slowest | Most detailed |
SwinIR for Maximum Quality#
SwinIR uses a Swin Transformer architecture and produces the highest quality results, but it’s slower than ESRGAN.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
| from transformers import AutoImageProcessor, SwinForImageClassification
import torch
from PIL import Image
import torchvision.transforms as T
# Using HuggingFace's SwinIR implementation
from huggingface_hub import hf_hub_download
import importlib
# Download SwinIR model weights from the HuggingFace Hub (cached locally).
# NOTE(review): `model_path` is downloaded here but never used by the code
# below — wiring it into a SwinIR network is left incomplete in this example.
model_path = hf_hub_download(
    repo_id="caidas/swinIR-M-real-sr-x4",
    filename="003_realSR_BSRGAN_DFO_s64w8_SwinIR-M_x4_GAN.pth"
)
# Alternative: use the swinir package directly
# pip install swinir
from PIL import Image
import numpy as np
def upscale_with_swinir(image_path: str, output_path: str, scale: int = 4):
    """Upscale using SwinIR (higher quality, slower).

    NOTE(review): this is a skeleton — the model inference is commented out
    below, so nothing is ever written to `output_path`; the path is simply
    returned unchanged. Only loading and window-padding preprocessing are
    implemented. `scale` is accepted for API symmetry but unused in this stub.
    """
    img = Image.open(image_path).convert("RGB")
    # SwinIR requires specific preprocessing
    img_tensor = T.ToTensor()(img).unsqueeze(0)  # shape (1, 3, H, W), floats in [0, 1]
    # Pad to multiple of window size (8)
    _, _, h, w = img_tensor.shape
    pad_h = (8 - h % 8) % 8  # the outer % 8 makes the pad 0 when h is already a multiple
    pad_w = (8 - w % 8) % 8
    # Reflect padding avoids hard black borders that the model would "enhance".
    img_tensor = torch.nn.functional.pad(img_tensor, (0, pad_w, 0, pad_h), mode="reflect")
    print(f"Processing {w}x{h} image with SwinIR...")
    # Model inference would go here
    # output = model(img_tensor)
    return output_path
|
Batch Processing#
For processing entire directories of images:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
| from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
import time
def batch_upscale(
    input_dir: str,
    output_dir: str,
    upscaler: "RealESRGANer",
    scale: int = 4,
    max_workers: int = 1,  # GPU bound, 1 is usually optimal
) -> dict:
    """Upscale all images in a directory.

    Args:
        input_dir: Directory containing the source images.
        output_dir: Directory for upscaled results (created if missing).
        upscaler: A configured RealESRGANer instance.
        scale: Output scale factor passed to ``enhance(outscale=...)``.
        max_workers: Reserved for future parallelism. NOTE(review): currently
            unused — processing is sequential, which is usually right for a
            single GPU anyway.

    Returns:
        Stats dict with "processed", "failed", and "total_time" (seconds).
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)
    image_extensions = {".jpg", ".jpeg", ".png", ".webp", ".bmp"}
    # Sort for a deterministic processing order across runs and filesystems.
    images = sorted(
        f for f in input_path.iterdir() if f.suffix.lower() in image_extensions
    )
    stats = {"processed": 0, "failed": 0, "total_time": 0.0}
    for img_path in images:
        try:
            # perf_counter is monotonic — unlike time.time it can't jump backwards.
            start = time.perf_counter()
            img = np.array(Image.open(img_path).convert("RGB"))
            output, _ = upscaler.enhance(img, outscale=scale)
            out_file = output_path / f"{img_path.stem}_upscaled.png"
            Image.fromarray(output).save(out_file)
            elapsed = time.perf_counter() - start
            stats["processed"] += 1
            stats["total_time"] += elapsed
            print(f" {img_path.name} → {out_file.name} ({elapsed:.1f}s)")
        except Exception as e:
            # A batch job shouldn't die on one bad file — record and continue.
            stats["failed"] += 1
            print(f" FAILED {img_path.name}: {e}")
    return stats
# Run the batch over the sample directories, reusing the module-level `upscaler`.
stats = batch_upscale("./low_res/", "./high_res/", upscaler)
print(f"\nDone: {stats['processed']} processed, {stats['failed']} failed")
# max(..., 1) guards the division when nothing was processed.
print(f"Average: {stats['total_time'] / max(stats['processed'], 1):.1f}s per image")
|
Tiling for Large Images and Low VRAM#
Large images (2000x2000+) or GPUs with less than 6GB VRAM need tiling — process the image in overlapping patches and stitch them together.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
def upscale_large_image(
    image_path: str,
    output_path: str,
    tile_size: int = 512,
    overlap: int = 32,
    scale: int = 4,
) -> None:
    """Upscale large images using tiling to avoid OOM.

    Args:
        image_path: Source image path.
        output_path: Where the upscaled PNG is written.
        tile_size: Edge length of each processing tile (smaller = less VRAM).
        overlap: Padding around each tile, blended away to hide seams.
        scale: Desired output scale factor (applied via ``outscale``).
    """
    # The x4plus weights define a 4x network, so both RRDBNet and the
    # RealESRGANer wrapper must use scale=4 regardless of the requested
    # output scale — `outscale` in enhance() resizes to `scale` afterwards.
    model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=4)
    upscaler = RealESRGANer(
        scale=4,
        model_path="RealESRGAN_x4plus.pth",
        model=model,
        tile=tile_size,
        tile_pad=overlap,
        pre_pad=0,
        half=True,
    )
    img = np.array(Image.open(image_path).convert("RGB"))
    print(f"Input: {img.shape[1]}x{img.shape[0]}, Tile size: {tile_size}")
    output, _ = upscaler.enhance(img, outscale=scale)
    Image.fromarray(output).save(output_path)
    print(f"Output: {output.shape[1]}x{output.shape[0]}")

# 4K input with tiling (won't OOM on 4GB VRAM)
upscale_large_image("large_photo.jpg", "large_upscaled.png", tile_size=256)
|
The tile_pad overlap ensures seamless stitching between tiles — without it, you get visible grid lines at tile boundaries.
Comparing Upscaling Methods#
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
| from PIL import Image
import numpy as np
def compare_methods(image_path: str, scale: int = 4):
    """Write four upscaled versions of one image — one per method — for eyeballing."""
    source = Image.open(image_path).convert("RGB")
    target_size = (source.width * scale, source.height * scale)

    # Classic interpolation baselines, cheapest to best:
    # nearest neighbor (pixelated), bicubic (smooth but blurry),
    # Lanczos (sharper than bicubic, still soft).
    source.resize(target_size, Image.NEAREST).save("compare_nearest.png")
    source.resize(target_size, Image.BICUBIC).save("compare_bicubic.png")
    source.resize(target_size, Image.LANCZOS).save("compare_lanczos.png")

    # AI upscaling via the module-level Real-ESRGAN upsampler:
    # sharp edges plus plausible detail synthesized by the model.
    enhanced, _ = upsampler.enhance(np.array(source), outscale=scale)
    Image.fromarray(enhanced).save("compare_ai.png")

    print(f"Saved 4 comparison images at {target_size[0]}x{target_size[1]}")

compare_methods("test_256.jpg")
|
At 4x upscale, the differences are dramatic. Bicubic produces a blurry mess. Lanczos is slightly better but still soft. AI upscaling produces sharp edges, realistic textures, and plausible detail that wasn’t in the original.
Common Errors and Fixes#
CUDA out of memory on large images
Enable tiling: set tile=256 or tile=128 in the upscaler constructor. Smaller tiles use less VRAM but process slower. Also ensure half=True is set for fp16.
Output has visible tile boundaries
Increase tile_pad from 10 to 32 or 64. The padding creates overlap between tiles that gets blended during stitching.
Faces look distorted after upscaling
Use GFPGAN or CodeFormer for face-specific enhancement. Real-ESRGAN is great for general content but can produce uncanny faces. Process faces separately and composite them back:
1
2
3
4
| # pip install gfpgan
from gfpgan import GFPGANer
# GFPGAN restores faces while upscaling the whole frame.
face_enhancer = GFPGANer(model_path="GFPGANv1.4.pth", upscale=4)
# enhance() returns (cropped_faces, restored_faces, full_image); with
# paste_back=True the restored faces are composited into the full image.
_, _, output = face_enhancer.enhance(img_array, paste_back=True)
|
Artifacts on heavily compressed JPEG inputs
JPEG compression adds block artifacts that the upscaler can amplify. Pre-process with a JPEG artifact removal model, or use the RealESRGAN_x4plus model which was specifically trained on compressed inputs.
CPU-only processing is too slow
Without a GPU, expect 30-60 seconds per image. Use a smaller model (RealESRGAN_x4plus_anime_6B has fewer parameters) or reduce the output scale from 4x to 2x. For batch processing, consider a cloud GPU.