The Quick Version
If your GPU utilization is below 80% during training, you’re leaving performance on the table. The most common bottleneck isn’t the GPU — it’s the data pipeline starving the GPU of work. Here’s how to find out where your time is going.
1
2
# Real-time GPU monitoring
watch -n 1 nvidia-smi
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import subprocess
import json
import time


def gpu_snapshot() -> list[dict]:
    """Query nvidia-smi once and return per-GPU stats as structured data.

    Returns:
        One dict per GPU (index, name, utilization, memory, temperature,
        power draw). Empty list if nvidia-smi produced no output.

    Raises:
        FileNotFoundError: if nvidia-smi is not on PATH.
        subprocess.TimeoutExpired: if nvidia-smi hangs (5 s timeout).
    """
    result = subprocess.run(
        [
            "nvidia-smi",
            "--query-gpu=index,name,utilization.gpu,utilization.memory,"
            "memory.used,memory.total,temperature.gpu,power.draw",
            "--format=csv,noheader,nounits",
        ],
        capture_output=True,
        text=True,
        timeout=5,  # don't hang the caller if the driver is wedged
    )

    def _num(field: str) -> float:
        # nvidia-smi emits "[N/A]" for unsupported sensors (e.g. power draw
        # on some boards); report NaN instead of crashing on float().
        try:
            return float(field)
        except ValueError:
            return float("nan")

    gpus = []
    for line in result.stdout.strip().split("\n"):
        if not line:
            continue  # empty output (e.g. no GPUs visible)
        parts = [p.strip() for p in line.split(",")]
        gpus.append({
            "index": int(parts[0]),
            "name": parts[1],
            "gpu_util_pct": _num(parts[2]),
            "mem_util_pct": _num(parts[3]),
            "mem_used_mb": _num(parts[4]),
            "mem_total_mb": _num(parts[5]),
            "temp_c": _num(parts[6]),
            "power_w": _num(parts[7]),
        })
    return gpus
# Poll the GPUs ten times, two seconds apart, printing a one-line summary
# per device on each pass.
for _ in range(10):
    for gpu in gpu_snapshot():
        print(f"GPU {gpu['index']}: {gpu['gpu_util_pct']:.0f}% util, "
              f"{gpu['mem_used_mb']:.0f}/{gpu['mem_total_mb']:.0f} MB, "
              f"{gpu['temp_c']}°C, {gpu['power_w']}W")
    time.sleep(2)
|
If gpu_util_pct fluctuates between 0% and 100%, your GPU is waiting for data between batches. If it’s steady at 90%+, you’re compute-bound (good — that means the GPU is working at capacity).
PyTorch Profiler: Finding Exact Bottlenecks
The PyTorch profiler breaks down time spent in each operation — data loading, forward pass, backward pass, optimizer step, and everything in between.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import torch
from torch.profiler import profile, record_function, ProfilerActivity
from torchvision.models import resnet50
from torch.utils.data import DataLoader
import torchvision.transforms as T
import torchvision.datasets as datasets

model = resnet50().cuda()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
criterion = torch.nn.CrossEntropyLoss()

# Sample pipeline: FakeData yields PIL images, so hand it the full
# Resize/CenterCrop/ToTensor transform (it was previously built but never
# used — FakeData got a bare T.ToTensor() instead).
transform = T.Compose([T.Resize(256), T.CenterCrop(224), T.ToTensor()])
dataset = datasets.FakeData(size=1000, image_size=(3, 224, 224), transform=transform)
loader = DataLoader(dataset, batch_size=64, num_workers=4, pin_memory=True)

# Profile 5 training steps: skip 1 (wait), warm up 1, record 3 active steps.
with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    schedule=torch.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
    on_trace_ready=torch.profiler.tensorboard_trace_handler("./profiler_logs"),
    record_shapes=True,
    profile_memory=True,
    with_stack=True,
) as prof:
    for step, (images, labels) in enumerate(loader):
        if step >= 5:
            break
        # Label each phase so it appears as a named region in the trace.
        with record_function("data_transfer"):
            images = images.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)
        with record_function("forward"):
            outputs = model(images)
            loss = criterion(outputs, labels)
        with record_function("backward"):
            optimizer.zero_grad()
            loss.backward()
        with record_function("optimizer"):
            optimizer.step()
        prof.step()  # advance the profiler schedule

# Print summary, heaviest CUDA ops first
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=15))
|
The output table shows you exactly where time goes:
1
2
3
4
5
6
| Name CPU total CUDA total # Calls
---------------------- ---------- ----------- --------
backward 125.3ms 98.7ms 3
forward 78.2ms 67.4ms 3
data_transfer 12.1ms 3.2ms 3
optimizer 15.4ms 11.8ms 3
|
If data_transfer or DataLoader time dominates, you have a data pipeline bottleneck. If forward and backward dominate, you’re GPU-bound (the ideal case).
Diagnosing Data Pipeline Bottlenecks
The data pipeline is the #1 bottleneck in most training jobs. Test it in isolation:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
| import time
from torch.utils.data import DataLoader
def benchmark_dataloader(loader: DataLoader, num_batches: int = 50) -> dict:
    """Measure how fast the dataloader can deliver batches.

    Args:
        loader: any re-iterable of batches (typically a DataLoader).
        num_batches: number of batch fetches to time.

    Returns:
        Dict with average/fastest/slowest batch time (ms) and throughput
        in batches per second.

    Raises:
        ValueError: if num_batches < 1 or the loader yields no batches.
    """
    if num_batches < 1:
        raise ValueError("num_batches must be >= 1")
    batch_iter = iter(loader)
    # Warm up: exclude worker spin-up from the timings.
    try:
        next(batch_iter)
    except StopIteration:
        raise ValueError("loader yielded no batches") from None
    times = []
    for _ in range(num_batches):
        # perf_counter is monotonic and high-resolution; time.time can jump
        # backwards under NTP adjustment.
        start = time.perf_counter()
        try:
            next(batch_iter)
        except StopIteration:
            # Exhausted: restart the epoch. The restart cost stays in this
            # sample on purpose — real training pays it at epoch boundaries.
            batch_iter = iter(loader)
            next(batch_iter)
        times.append(time.perf_counter() - start)
    avg_ms = sum(times) / len(times) * 1000
    throughput = len(times) / sum(times)
    return {
        "avg_batch_time_ms": round(avg_ms, 2),
        "batches_per_second": round(throughput, 2),
        "slowest_batch_ms": round(max(times) * 1000, 2),
        "fastest_batch_ms": round(min(times) * 1000, 2),
    }
# Sweep worker counts to find where dataloader throughput saturates.
for workers in (0, 2, 4, 8, 12):
    loader = DataLoader(
        dataset,
        batch_size=64,
        num_workers=workers,
        pin_memory=True,
        persistent_workers=workers > 0,  # keep worker processes alive across epochs
    )
    stats = benchmark_dataloader(loader)
    print(f"workers={workers:2d}: {stats['avg_batch_time_ms']:6.1f}ms/batch, "
          f"{stats['batches_per_second']:.1f} batches/sec")
|
Typical results:
1
2
3
4
5
| workers= 0: 45.2ms/batch, 22.1 batches/sec
workers= 2: 18.7ms/batch, 53.5 batches/sec
workers= 4: 9.3ms/batch, 107.5 batches/sec
workers= 8: 5.1ms/batch, 196.1 batches/sec
workers=12: 5.0ms/batch, 200.0 batches/sec
|
If your GPU processes a batch in 20ms but the dataloader takes 45ms to deliver one, the GPU idles 55% of the time. Increasing num_workers to 4-8 usually fixes this.
Memory Profiling
Track GPU memory allocation to find leaks and optimize batch sizes:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import torch


def memory_report() -> dict:
    """Return a snapshot of GPU memory use in MB: currently allocated,
    reserved by the caching allocator, and the peak allocation so far."""
    mb = 1024 ** 2
    return {
        "allocated_mb": torch.cuda.memory_allocated() / mb,
        "reserved_mb": torch.cuda.memory_reserved() / mb,
        "max_allocated_mb": torch.cuda.max_memory_allocated() / mb,
    }
def find_max_batch_size(model, input_shape: tuple, start: int = 8, device: str = "cuda") -> int:
    """Binary search for the largest inference batch size that fits in GPU memory.

    Args:
        model: module to probe (moved to `device`; caller should .eval() it).
        input_shape: per-sample input shape, e.g. (3, 224, 224).
        start: initial batch size for the doubling phase.
        device: device to probe on.

    Returns:
        The largest batch size whose forward pass does not OOM.

    Raises:
        RuntimeError: if even batch size 1 does not fit.
    """
    model = model.to(device)

    def _fits(bs: int) -> bool:
        # Probe one no-grad forward pass at batch size `bs`; OOM => doesn't fit.
        try:
            torch.cuda.empty_cache()
            x = torch.randn(bs, *input_shape, device=device)
            with torch.no_grad():
                model(x)
            del x
            torch.cuda.empty_cache()
            return True
        except torch.cuda.OutOfMemoryError:
            torch.cuda.empty_cache()  # release the partial allocation
            return False

    # Exponential phase: double until the first size that OOMs.
    high = max(1, start)
    while _fits(high):
        high *= 2

    # `high` OOMs, so the answer lies in [1, high - 1]. The original searched
    # [start, high] and returned `start` even when `start` itself OOMed,
    # handing the caller a batch size guaranteed to crash; searching down to 1
    # and raising when nothing fits avoids that.
    low, high, best = 1, high - 1, 0
    while low <= high:
        mid = (low + high) // 2
        if _fits(mid):
            best = mid
            low = mid + 1
        else:
            high = mid - 1
    if best == 0:
        raise RuntimeError("Model does not fit in GPU memory even at batch size 1")
    return best
model = resnet50().cuda().eval()
max_bs = find_max_batch_size(model, (3, 224, 224))
print(f"Max inference batch size: {max_bs}")
# Recommend ~40% of the inference maximum: training needs far more memory
# than inference (activations kept for backward, gradients, optimizer state).
print(f"Recommended training batch size: {int(max_bs * 0.4)}")
|
Continuous Monitoring During Training
Log GPU stats alongside training metrics to correlate performance issues with training events:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import threading
import csv
from datetime import datetime


class GPUMonitor:
    """Background GPU monitor that appends one CSV row per GPU per interval.

    Relies on the module-level gpu_snapshot() helper for the actual
    nvidia-smi query.
    """

    def __init__(self, log_path: str = "gpu_metrics.csv", interval: float = 5.0):
        """Create the monitor and (re)write the CSV header.

        Args:
            log_path: destination CSV file (truncated on construction).
            interval: seconds between samples.
        """
        self.log_path = log_path
        self.interval = interval
        self.running = False
        self._thread = None
        # Event lets stop() interrupt the sampling wait immediately instead
        # of blocking for up to `interval` seconds on a plain sleep.
        self._stop_event = threading.Event()
        with open(log_path, "w", newline="") as f:
            writer = csv.writer(f)
            writer.writerow(["timestamp", "gpu_idx", "util_pct", "mem_used_mb",
                             "mem_total_mb", "temp_c", "power_w"])

    def start(self):
        """Start the background sampling thread (daemon: won't block exit)."""
        self.running = True
        self._stop_event.clear()
        self._thread = threading.Thread(target=self._monitor_loop, daemon=True)
        self._thread.start()

    def stop(self):
        """Signal the sampling thread to stop and wait for it to finish."""
        self.running = False
        self._stop_event.set()  # wake the thread out of its interval wait
        if self._thread:
            self._thread.join()

    def _monitor_loop(self):
        while self.running:
            stats = gpu_snapshot()
            timestamp = datetime.now().isoformat()
            # Reopen in append mode each cycle so rows survive a crash.
            with open(self.log_path, "a", newline="") as f:
                writer = csv.writer(f)
                for gpu in stats:
                    writer.writerow([
                        timestamp, gpu["index"], gpu["gpu_util_pct"],
                        gpu["mem_used_mb"], gpu["mem_total_mb"],
                        gpu["temp_c"], gpu["power_w"],
                    ])
            # wait() returns True when stop() set the event -> exit promptly.
            if self._stop_event.wait(self.interval):
                break
# Start monitoring before training; sampling runs on a daemon thread so it
# never blocks the training loop or process exit.
monitor = GPUMonitor(interval=5.0)
monitor.start()
# ... your training loop ...
monitor.stop()
print(f"GPU metrics saved to {monitor.log_path}")
|
Common Errors and Fixes
GPU utilization is 0% but training is “running”
Your code is stuck on CPU operations — data loading, preprocessing, or Python overhead between batches. Profile with the PyTorch profiler to find exactly where.
GPU utilization fluctuates wildly (0-100%)
Classic data pipeline bottleneck. The GPU finishes a batch fast, then waits for the next one. Increase num_workers, enable pin_memory=True, and use persistent_workers=True to keep worker processes alive between epochs.
Memory grows slowly over training
You’re accumulating tensors that aren’t being freed — usually from appending loss values to a list without calling .item():
1
2
3
4
5
# BAD: keeps the whole computation graph alive in GPU memory
losses.append(loss)

# GOOD: extracts the Python scalar, freeing the graph
losses.append(loss.item())
nvidia-smi shows high memory but low utilization
Memory is allocated but not actively used. This happens with large batch sizes where most memory holds activations during the backward pass. The GPU is still doing useful work — memory utilization and compute utilization are independent metrics.
Temperature throttling reduces performance
If GPU temp exceeds 83°C, most GPUs throttle clock speeds. Improve cooling, reduce power limit (nvidia-smi -pl 250), or add a brief time.sleep(0.01) between batches to let the GPU cool. In data centers, check that the server’s airflow isn’t blocked.