Measuring the actual device-memory usage of every PyTorch submodule with change_current_allocator
1. Background
- Goal: accurately measure the device memory each layer of a PyTorch model actually uses during computation.
- Problem: because the caching allocator rounds allocation sizes up for alignment, reading torch.cuda.memory_allocated() directly is not accurate.
- Approach (a minimal sketch follows this list):
  - Set CUBLAS_WORKSPACE_CONFIG to rule out the influence of the cuBLAS workspace.
  - Install a custom memory allocator via torch.cuda.memory.change_current_allocator.
  - Record every allocation and free inside that custom allocator.
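As a quick preview of the approach, here is a minimal sketch (my_malloc and my_free are the symbols exported by the alloc.so built in the next section):

import torch

# Must run before any CUDA memory is allocated in this process,
# otherwise change_current_allocator raises an error.
new_alloc = torch.cuda.memory.CUDAPluggableAllocator('./alloc.so', 'my_malloc', 'my_free')
torch.cuda.memory.change_current_allocator(new_alloc)

x = torch.ones(1024, device='cuda')  # this allocation now goes through my_malloc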
2. Custom memory allocator
tee alloc.cc <<-'EOF'
#include <sys/types.h>
#include <cuda_runtime_api.h>
#include <unordered_map>
#include <mutex>

// Memory monitor: tracks every live allocation and the running total
class MemoryMonitor {
public:
    // Allocate device memory and record its size
    void* allocate(size_t size) {
        void* ptr = nullptr;
        cudaMalloc(&ptr, size);
        if (ptr) {
            std::lock_guard<std::mutex> lock(mtx);
            allocations[ptr] = size;
            totalAllocated += size;
        }
        return ptr;
    }
    // Free device memory and drop its record
    void deallocate(void* ptr) {
        if (ptr) {
            {
                std::lock_guard<std::mutex> lock(mtx);
                auto it = allocations.find(ptr);
                if (it != allocations.end()) {
                    totalAllocated -= it->second;
                    allocations.erase(it);
                }
            }
            cudaFree(ptr);
        }
    }
    // Current total of live allocations, in bytes
    size_t getTotalAllocated() const {
        std::lock_guard<std::mutex> lock(mtx);
        return totalAllocated;
    }
private:
    std::unordered_map<void*, size_t> allocations; // live allocations: pointer -> size
    size_t totalAllocated = 0;                     // running total in bytes
    mutable std::mutex mtx;                        // guards the two members above
};

MemoryMonitor monitor;

extern "C" {
// Signatures expected by torch.cuda.memory.CUDAPluggableAllocator
void* my_malloc(ssize_t size, int device, cudaStream_t stream) {
    return monitor.allocate(size);
}
void my_free(void* ptr, ssize_t size, int device, cudaStream_t stream) {
    monitor.deallocate(ptr);
}
// Exported so Python can poll the running total via ctypes
unsigned long long getTotalAllocated() {
    return monitor.getTotalAllocated();
}
}
EOF
g++ alloc.cc -o alloc.so -I/usr/local/cuda/include -shared -fPIC
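Before wiring the allocator into PyTorch, it can be sanity-checked directly through ctypes. The following standalone script is an illustration added here, not part of the original post; the filename check_alloc.py and the declared argtypes are assumptions based on the signatures above:

tee check_alloc.py <<-'EOF'
import ctypes

# Load the shared library built above and declare the exported signatures
lib = ctypes.CDLL('./alloc.so')
lib.my_malloc.restype = ctypes.c_void_p
lib.my_malloc.argtypes = [ctypes.c_ssize_t, ctypes.c_int, ctypes.c_void_p]
lib.my_free.argtypes = [ctypes.c_void_p, ctypes.c_ssize_t, ctypes.c_int, ctypes.c_void_p]
lib.getTotalAllocated.restype = ctypes.c_ulonglong

ptr = lib.my_malloc(1 << 20, 0, None)  # 1 MiB on device 0, default stream
print(lib.getTotalAllocated())         # expected: 1048576
lib.my_free(ptr, 1 << 20, 0, None)
print(lib.getTotalAllocated())         # expected: 0
EOF
python check_alloc.py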
3. PyTorch test code
tee torch_mem_stat.py <<-'EOF'
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"        # make kernel launches synchronous so the hooks see settled memory
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ":0:0"  # give cuBLAS a zero-size workspace so it does not distort the numbers
import ctypes
import sys
import torch
import torch.nn as nn
from torch.nn import Module, Linear
from torch.optim import Adam

hook_allocator = int(sys.argv[1])
if hook_allocator == 1:
    os.environ['PYTORCH_NO_CUDA_MEMORY_CACHING'] = '1'  # disable caching so every allocation hits my_malloc
    lib = ctypes.CDLL('./alloc.so')
    lib.getTotalAllocated.restype = ctypes.c_ulonglong
    print("hook_allocator")
    new_alloc = torch.cuda.memory.CUDAPluggableAllocator('./alloc.so', 'my_malloc', 'my_free')
    torch.cuda.memory.change_current_allocator(new_alloc)

def get_memory_allocated():
    if hook_allocator:
        return lib.getTotalAllocated()
    else:
        return torch.cuda.memory_allocated()

# Object and class-name caches used to build unique module names
object_cache = {}
class_name_count = {}

def is_tensor(val):
    return isinstance(val, (torch.Tensor, nn.Parameter))

def describe_tensor_data(tensor, desc=""):
    # Recursively describe a tensor, or a tuple/list of tensors, as "shape_dtype" text
    if is_tensor(tensor):
        desc += f"[shape({','.join(map(str, list(tensor.shape)))})_dtype({tensor.dtype})]"
    elif isinstance(tensor, (tuple, list)):
        for idx, t in enumerate(tensor):
            desc = describe_tensor_data(t, f"{desc}idx({idx})")
    else:
        desc += f"[dtype({type(tensor)})]"
    return desc

def get_unique_name(class_name, obj_id):
    # Assign a per-class running index to each module instance
    if class_name not in class_name_count:
        class_name_count[class_name] = 0
    uid = f"{class_name}_{obj_id}"
    if uid not in object_cache:
        class_name_count[class_name] += 1
        object_cache[uid] = {"idx": class_name_count[class_name]}
    return f'-{object_cache[uid]["idx"]}'

def initialize_module_attributes(name, module):
    # Attach bookkeeping attributes to the module
    if not hasattr(module, 'uuid'):
        module.uuid = name + get_unique_name(module.__class__.__name__, id(module))
    if not hasattr(module, 'backward_mem'):
        module.backward_mem = 0
    if not hasattr(module, 'forward_mem'):
        module.forward_mem = 0
    if not hasattr(module, 'fwd_mem_sz'):
        module.fwd_mem_sz = None
    if not hasattr(module, 'bwd_mem_sz'):
        module.bwd_mem_sz = None

def pre_backward_hook(module, grad_input):
    module.backward_mem = get_memory_allocated()

def post_backward_hook(module, grad_input, grad_output):
    memory_allocated = get_memory_allocated()
    module.bwd_mem_sz = memory_allocated - module.backward_mem
    rank = 0
    if torch.distributed.is_initialized():
        rank = torch.distributed.get_rank()
    if rank == 0:
        with open("torch_module_mem_info.txt", "a+") as f:
            f.write(f"bwd-{module.uuid}#{module.bwd_mem_sz}#{memory_allocated}#{describe_tensor_data(grad_input)}#{describe_tensor_data(grad_output)}\n")

def pre_forward_hook(module, input):
    module.forward_mem = get_memory_allocated()

def post_forward_hook(module, input, output):
    memory_allocated = get_memory_allocated()
    module.fwd_mem_sz = memory_allocated - module.forward_mem
    rank = 0
    if torch.distributed.is_initialized():
        rank = torch.distributed.get_rank()
    if rank == 0:
        with open("torch_module_mem_info.txt", "a+") as f:
            f.write(f"fwd-{module.uuid}#{module.fwd_mem_sz}#{memory_allocated}#{describe_tensor_data(input)}#{describe_tensor_data(output)}\n")

def register_forward_hooks(name, module):
    initialize_module_attributes(name, module)
    module.register_forward_pre_hook(pre_forward_hook)
    module.register_forward_hook(post_forward_hook)

def register_backward_hooks(name, module):
    initialize_module_attributes(name, module)
    module.register_full_backward_pre_hook(pre_backward_hook)
    module.register_full_backward_hook(post_backward_hook)

class HookModel(object):
    def __init__(self, model):
        output_dict = {}
        self.get_submodule_recursive(model, "", output_dict)
        for name, module in output_dict.items():
            if name.endswith("Sequential"):  # containers allocate nothing of their own
                continue
            register_forward_hooks(name, module)
            register_backward_hooks(name, module)

    def get_submodule_recursive(self, module, prefix, output_dict):
        prefix = prefix + "/" + type(module).__name__
        output_dict[prefix] = module
        for name, submodule in module.named_children():
            self.get_submodule_recursive(submodule, f"{prefix}.{name}", output_dict)

class FeedForward(Module):
    def __init__(self, hidden_size, ffn_size):
        super().__init__()
        self.fc = nn.Sequential(
            Linear(in_features=hidden_size, out_features=ffn_size, bias=False),
            nn.ReLU(),
            Linear(in_features=ffn_size, out_features=ffn_size*2, bias=False),
            nn.Dropout(0.5),
            Linear(in_features=ffn_size*2, out_features=hidden_size, bias=False),
        )
        self.norm = nn.LayerNorm(normalized_shape=hidden_size, elementwise_affine=False)

    def forward(self, x):
        return x + self.fc(self.norm(x))

def main():
    model = FeedForward(100, 128)
    model = model.float().cuda()
    model.train()
    obj = HookModel(model)
    opt = Adam(model.parameters(), lr=0.001)
    input = torch.randn(1, 100).float().cuda()
    with open("torch_module_mem_info.txt", "w") as f:
        f.write("")
    for i in range(1):
        output = model(input)
        loss = -torch.log(output.sum())
        opt.zero_grad()
        loss.backward()
        opt.step()

main()
EOF
python torch_mem_stat.py 0
cat torch_module_mem_info.txt
python torch_mem_stat.py 1
cat torch_module_mem_info.txt
Output
# Default allocator
fwd-/FeedForward.norm/LayerNorm-1#512#285696#idx(0)[shape(1,100)_dtype(torch.float32)]#[shape(1,100)_dtype(torch.float32)]
fwd-/FeedForward.fc/Sequential.0/Linear-1#512#286208#idx(0)[shape(1,100)_dtype(torch.float32)]#[shape(1,128)_dtype(torch.float32)]
fwd-/FeedForward.fc/Sequential.1/ReLU-1#512#286720#idx(0)[shape(1,128)_dtype(torch.float32)]#[shape(1,128)_dtype(torch.float32)]
fwd-/FeedForward.fc/Sequential.2/Linear-2#1024#287232#idx(0)[shape(1,128)_dtype(torch.float32)]#[shape(1,256)_dtype(torch.float32)]
fwd-/FeedForward.fc/Sequential.3/Dropout-1#1536#288768#idx(0)[shape(1,256)_dtype(torch.float32)]#[shape(1,256)_dtype(torch.float32)]
fwd-/FeedForward.fc/Sequential.4/Linear-3#512#288256#idx(0)[shape(1,256)_dtype(torch.float32)]#[shape(1,100)_dtype(torch.float32)]
fwd-/FeedForward-1#3072#288256#idx(0)[shape(1,100)_dtype(torch.float32)]#[shape(1,100)_dtype(torch.float32)]
bwd-/FeedForward-1#0#289792#idx(0)[dtype(<class 'NoneType'>)]#idx(0)[shape(1,100)_dtype(torch.float32)]
bwd-/FeedForward.fc/Sequential.4/Linear-3#102400#392192#idx(0)[shape(1,256)_dtype(torch.float32)]#idx(0)[shape(1,100)_dtype(torch.float32)]
bwd-/FeedForward.fc/Sequential.3/Dropout-1#512#392192#idx(0)[shape(1,256)_dtype(torch.float32)]#idx(0)[shape(1,256)_dtype(torch.float32)]
bwd-/FeedForward.fc/Sequential.2/Linear-2#131584#522752#idx(0)[shape(1,128)_dtype(torch.float32)]#idx(0)[shape(1,256)_dtype(torch.float32)]
bwd-/FeedForward.fc/Sequential.1/ReLU-1#0#521728#idx(0)[shape(1,128)_dtype(torch.float32)]#idx(0)[shape(1,128)_dtype(torch.float32)]
bwd-/FeedForward.fc/Sequential.0/Linear-1#0#521216#idx(0)[dtype(<class 'NoneType'>)]#idx(0)[shape(1,128)_dtype(torch.float32)]
# Custom allocator
fwd-/FeedForward.norm/LayerNorm-1#400#285472#idx(0)[shape(1,100)_dtype(torch.float32)]#[shape(1,100)_dtype(torch.float32)]
fwd-/FeedForward.fc/Sequential.0/Linear-1#512#285984#idx(0)[shape(1,100)_dtype(torch.float32)]#[shape(1,128)_dtype(torch.float32)]
fwd-/FeedForward.fc/Sequential.1/ReLU-1#512#286496#idx(0)[shape(1,128)_dtype(torch.float32)]#[shape(1,128)_dtype(torch.float32)]
fwd-/FeedForward.fc/Sequential.2/Linear-2#1024#287008#idx(0)[shape(1,128)_dtype(torch.float32)]#[shape(1,256)_dtype(torch.float32)]
fwd-/FeedForward.fc/Sequential.3/Dropout-1#1280#288288#idx(0)[shape(1,256)_dtype(torch.float32)]#[shape(1,256)_dtype(torch.float32)]
fwd-/FeedForward.fc/Sequential.4/Linear-3#400#287664#idx(0)[shape(1,256)_dtype(torch.float32)]#[shape(1,100)_dtype(torch.float32)]
fwd-/FeedForward-1#2592#287664#idx(0)[shape(1,100)_dtype(torch.float32)]#[shape(1,100)_dtype(torch.float32)]
bwd-/FeedForward-1#0#287676#idx(0)[dtype(<class 'NoneType'>)]#idx(0)[shape(1,100)_dtype(torch.float32)]
bwd-/FeedForward.fc/Sequential.4/Linear-3#102400#390076#idx(0)[shape(1,256)_dtype(torch.float32)]#idx(0)[shape(1,100)_dtype(torch.float32)]
bwd-/FeedForward.fc/Sequential.3/Dropout-1#768#390840#idx(0)[shape(1,256)_dtype(torch.float32)]#idx(0)[shape(1,256)_dtype(torch.float32)]
bwd-/FeedForward.fc/Sequential.2/Linear-2#131584#521400#idx(0)[shape(1,128)_dtype(torch.float32)]#idx(0)[shape(1,256)_dtype(torch.float32)]
bwd-/FeedForward.fc/Sequential.1/ReLU-1#0#520376#idx(0)[shape(1,128)_dtype(torch.float32)]#idx(0)[shape(1,128)_dtype(torch.float32)]
bwd-/FeedForward.fc/Sequential.0/Linear-1#0#519864#idx(0)[dtype(<class 'NoneType'>)]#idx(0)[shape(1,128)_dtype(torch.float32)]
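Each log line is '#'-separated: phase plus module uuid, the per-module memory delta in bytes, the total allocated after the call, and descriptions of the input and output tensors. Note that the custom allocator reports exact sizes: the (1,100) float32 LayerNorm output is 400 bytes, which the default caching allocator rounds up to 512. A small helper (hypothetical, not from the original post) can aggregate the deltas per module:

tee parse_mem_log.py <<-'EOF'
from collections import defaultdict

# Sum the per-module memory deltas recorded in the log
totals = defaultdict(int)
with open('torch_module_mem_info.txt') as f:
    for line in f:
        tag, delta, _total, *_descs = line.rstrip('\n').split('#')
        totals[tag] += int(delta)

# Largest consumers first
for tag, delta in sorted(totals.items(), key=lambda kv: -kv[1]):
    print(f'{delta:>10d} B  {tag}')
EOF
python parse_mem_log.py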
Original article: https://blog.csdn.net/m0_61864577/article/details/140559116