
Using change_current_allocator to measure the actual memory footprint of every sub-Module in PyTorch

1. Background

  • Goal: accurately measure the device memory that each layer of a PyTorch model actually needs
  • Problem: torch.cuda.memory_allocated() alone is not accurate, because the caching allocator rounds every allocation up to a 512-byte multiple (see the sketch after this list)
  • Approach:
    • Set CUBLAS_WORKSPACE_CONFIG to rule out the influence of the cuBLAS workspace
    • Install a custom memory allocator via torch.cuda.memory.change_current_allocator
    • Record every allocation and free inside that allocator
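
A minimal sketch of the problem (assuming a CUDA device is available): allocating a 400-byte tensor makes torch.cuda.memory_allocated() grow by 512 bytes, because the caching allocator rounds the request up.

import torch

base = torch.cuda.memory_allocated()
t = torch.empty(100, dtype=torch.float32, device="cuda")  # 100 * 4 = 400 bytes requested
print(torch.cuda.memory_allocated() - base)               # prints 512, not 400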

2. A custom memory allocator

tee alloc.cc <<-'EOF'
#include <sys/types.h>
#include <cuda_runtime_api.h>
#include <unordered_map>
#include <mutex>

// Memory monitor: tracks live cudaMalloc allocations and their total size
class MemoryMonitor {
public:
    // Allocate device memory and record the allocation
    void* allocate(size_t size) {
        void* ptr = nullptr;
        if (cudaMalloc(&ptr, size) != cudaSuccess) {
            return nullptr;
        }
        std::lock_guard<std::mutex> lock(mtx);
        allocations[ptr] = size;
        totalAllocated += size;
        return ptr;
    }

    // Free device memory and drop the record
    void deallocate(void* ptr) {
        if (ptr) {
            std::lock_guard<std::mutex> lock(mtx);
            auto it = allocations.find(ptr);
            if (it != allocations.end()) {
                totalAllocated -= it->second;
                allocations.erase(it);
            }
            cudaFree(ptr);
        }
    }

    // Current total of live allocations, in bytes
    size_t getTotalAllocated() const {
        std::lock_guard<std::mutex> lock(mtx);
        return totalAllocated;
    }

private:
    std::unordered_map<void*, size_t> allocations; // map from pointer to allocation size
    size_t totalAllocated = 0; // current total of live allocations
    mutable std::mutex mtx; // guards the two fields above
};

MemoryMonitor monitor;

extern "C" {
    void* my_malloc(ssize_t size, int device, cudaStream_t stream) {
       return monitor.allocate(size);
    }
    void my_free(void* ptr, ssize_t size, int device, cudaStream_t stream) {
       monitor.deallocate(ptr);
    }
    unsigned long long getTotalAllocated()
    {
        return monitor.getTotalAllocated();
    }
}
EOF
g++ alloc.cc -o alloc.so -I/usr/local/cuda/include -shared -fPIC
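
Before wiring the library into PyTorch, it can be sanity-checked from Python via ctypes (a quick sketch; only the exported counter is called here, since my_malloc/my_free expect CUDA arguments):

import ctypes

lib = ctypes.CDLL('./alloc.so')
lib.getTotalAllocated.restype = ctypes.c_ulonglong
print(lib.getTotalAllocated())  # 0: nothing has been routed through my_malloc yet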

3. PyTorch test code

tee torch_mem_stat.py <<-'EOF'
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"       # synchronous kernel launches, so memory deltas line up with the hooks
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ":0:0" # zero-size cuBLAS workspace, so it does not pollute per-module numbers
import ctypes
import sys
import torch
import torch.nn as nn
from torch.nn import Module, Linear
from torch.optim import Adam

hook_allocator=int(sys.argv[1])  # 0: default caching allocator, 1: custom allocator from alloc.so

if hook_allocator==1:
    # Disable the caching layer so that every allocation reaches my_malloc
    os.environ['PYTORCH_NO_CUDA_MEMORY_CACHING']='1'
    lib = ctypes.CDLL('./alloc.so')
    lib.getTotalAllocated.restype = ctypes.c_ulonglong
    print("hook_allocator")
    new_alloc = torch.cuda.memory.CUDAPluggableAllocator('./alloc.so', 'my_malloc', 'my_free')
    torch.cuda.memory.change_current_allocator(new_alloc)

def get_memory_allocated():
    if hook_allocator:
        return lib.getTotalAllocated()
    else:
        return torch.cuda.memory_allocated()

# Caches used to assign stable, human-readable names to module instances
object_cache = {}
class_name_count = {}

def is_tensor(val):
    return isinstance(val, (torch.Tensor, nn.Parameter))

def describe_tensor_data(tensor,desc=""):
    if is_tensor(tensor):
        desc+=f"[shape({','.join(map(str,list(tensor.shape)))})_dtype({tensor.dtype})]"
    elif isinstance(tensor, (tuple, list)):
        for idx, t in enumerate(tensor):
            desc=describe_tensor_data(t,f"{desc}idx({idx})")
    else:
        desc+=f"[dtype({type(tensor)})]"
    return desc

def get_unique_name(class_name, obj_id):
    # Generate a unique per-class index for this object
    if class_name not in class_name_count:
        class_name_count[class_name] = 0
    uid = f"{class_name}_{obj_id}"
    if uid not in object_cache:
        class_name_count[class_name] += 1
        object_cache[uid] = {"idx": class_name_count[class_name]}
    return f'-{object_cache[uid]["idx"]}'

def initialize_module_attributes(name,module):
    # Attach bookkeeping attributes to the module
    if not hasattr(module, 'uuid'):
        module.uuid = name+get_unique_name(module.__class__.__name__, id(module))
    if not hasattr(module, 'backward_mem'):
        module.backward_mem = 0
    if not hasattr(module, 'forward_mem'):
        module.forward_mem = 0
    if not hasattr(module, 'fwd_mem_sz'):
        module.fwd_mem_sz = None
    if not hasattr(module, 'bwd_mem_sz'):
        module.bwd_mem_sz = None    
        
def pre_backward_hook(module, grad_output):
    module.backward_mem=get_memory_allocated()

def post_backward_hook(module, grad_input, grad_output):
    memory_allocated=get_memory_allocated()
    module.bwd_mem_sz=memory_allocated-module.backward_mem
    rank=0
    if torch.distributed.is_initialized():
        rank=torch.distributed.get_rank()    
    if rank==0:
        with open("torch_module_mem_info.txt","a+") as f:
            f.write(f"bwd-{module.uuid}#{module.bwd_mem_sz}#{memory_allocated}#{describe_tensor_data(grad_input)}#{describe_tensor_data(grad_output)}\n")
    
def pre_forward_hook(module, input):   
    module.forward_mem=get_memory_allocated()

def post_forward_hook(module, input, output):
    memory_allocated=get_memory_allocated()
    module.fwd_mem_sz=memory_allocated-module.forward_mem
    rank=0
    if torch.distributed.is_initialized():
        rank=torch.distributed.get_rank()    
    if rank==0:    
        with open("torch_module_mem_info.txt","a+") as f:
            f.write(f"fwd-{module.uuid}#{module.fwd_mem_sz}#{memory_allocated}#{describe_tensor_data(input)}#{describe_tensor_data(output)}\n")

def register_forward_hooks(name,module):
    initialize_module_attributes(name,module)
    module.register_forward_pre_hook(pre_forward_hook)
    module.register_forward_hook(post_forward_hook)

def register_backward_hooks(name,module):
    initialize_module_attributes(name,module)
    module.register_full_backward_pre_hook(pre_backward_hook)
    module.register_full_backward_hook(post_backward_hook)

class HookModel(object):
    def __init__(self, model):
        output_dict = {}
        self.get_submodule_recursive(model, "", output_dict)
        for name, module in output_dict.items():
            if name.endswith("Sequential"):  # skip bare containers; their children are hooked individually
                continue
            register_forward_hooks(name,module)
            register_backward_hooks(name,module)
    def get_submodule_recursive(self,module, prefix, output_dict):
        prefix = prefix + "/" + type(module).__name__
        output_dict[prefix] = module
        for name, submodule in module.named_children():
            self.get_submodule_recursive(submodule, f"{prefix}.{name}", output_dict)

class FeedForward(Module):
    def __init__(self,hidden_size,ffn_size):
        super().__init__()
        self.fc = nn.Sequential(
            Linear(in_features=hidden_size, out_features=ffn_size,bias=False),
            nn.ReLU(),
            Linear(in_features=ffn_size, out_features=ffn_size*2,bias=False),
            nn.Dropout(0.5),
            Linear(in_features=ffn_size*2, out_features=hidden_size,bias=False),
        )
        self.norm = nn.LayerNorm(normalized_shape=hidden_size, elementwise_affine=False)
 
    def forward(self, x):
        return x + self.fc(self.norm(x))

def main():
    model=FeedForward(100,128) 
    model=model.float().cuda()
    model.train()
    obj=HookModel(model)
    opt=Adam(model.parameters(),lr=0.001)
    input=torch.randn(1,100).float().cuda()
    with open("torch_module_mem_info.txt","w") as f:
        f.write("")
    for i in range(1):
        output=model(input)
        loss=-torch.log(output.sum())
        opt.zero_grad()
        loss.backward()
        opt.step()
main()
EOF
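
Each hooked module also keeps its last measured deltas as attributes, so the numbers can be inspected programmatically as well; a sketch that would go at the end of main(), after the training loop:

for _, m in model.named_modules():
    if hasattr(m, 'uuid'):
        print(m.uuid, m.fwd_mem_sz, m.bwd_mem_sz)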

python torch_mem_stat.py 0   # run with the default caching allocator
cat torch_module_mem_info.txt
python torch_mem_stat.py 1   # run with the custom allocator
cat torch_module_mem_info.txt
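
Each log line is '#'-separated (tag, delta, running total, input shapes, output shapes), so a small helper (a sketch, not part of the original script) can aggregate the per-module deltas:

from collections import defaultdict

totals = defaultdict(int)
with open("torch_module_mem_info.txt") as f:
    for line in f:
        fields = line.split("#")
        totals[fields[0]] += int(fields[1])
for tag, delta in sorted(totals.items(), key=lambda kv: -kv[1]):
    print(f"{delta:>10} B  {tag}")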

Output

With the default caching allocator every reported delta is a multiple of 512 bytes; the custom allocator reports the exact requested sizes. For example, LayerNorm's 1x100 float32 output (400 bytes) shows up as 512 in the first run but as 400 in the second.

# Default allocator
fwd-/FeedForward.norm/LayerNorm-1#512#285696#idx(0)[shape(1,100)_dtype(torch.float32)]#[shape(1,100)_dtype(torch.float32)]
fwd-/FeedForward.fc/Sequential.0/Linear-1#512#286208#idx(0)[shape(1,100)_dtype(torch.float32)]#[shape(1,128)_dtype(torch.float32)]
fwd-/FeedForward.fc/Sequential.1/ReLU-1#512#286720#idx(0)[shape(1,128)_dtype(torch.float32)]#[shape(1,128)_dtype(torch.float32)]
fwd-/FeedForward.fc/Sequential.2/Linear-2#1024#287232#idx(0)[shape(1,128)_dtype(torch.float32)]#[shape(1,256)_dtype(torch.float32)]
fwd-/FeedForward.fc/Sequential.3/Dropout-1#1536#288768#idx(0)[shape(1,256)_dtype(torch.float32)]#[shape(1,256)_dtype(torch.float32)]
fwd-/FeedForward.fc/Sequential.4/Linear-3#512#288256#idx(0)[shape(1,256)_dtype(torch.float32)]#[shape(1,100)_dtype(torch.float32)]
fwd-/FeedForward-1#3072#288256#idx(0)[shape(1,100)_dtype(torch.float32)]#[shape(1,100)_dtype(torch.float32)]
bwd-/FeedForward-1#0#289792#idx(0)[dtype(<class 'NoneType'>)]#idx(0)[shape(1,100)_dtype(torch.float32)]
bwd-/FeedForward.fc/Sequential.4/Linear-3#102400#392192#idx(0)[shape(1,256)_dtype(torch.float32)]#idx(0)[shape(1,100)_dtype(torch.float32)]
bwd-/FeedForward.fc/Sequential.3/Dropout-1#512#392192#idx(0)[shape(1,256)_dtype(torch.float32)]#idx(0)[shape(1,256)_dtype(torch.float32)]
bwd-/FeedForward.fc/Sequential.2/Linear-2#131584#522752#idx(0)[shape(1,128)_dtype(torch.float32)]#idx(0)[shape(1,256)_dtype(torch.float32)]
bwd-/FeedForward.fc/Sequential.1/ReLU-1#0#521728#idx(0)[shape(1,128)_dtype(torch.float32)]#idx(0)[shape(1,128)_dtype(torch.float32)]
bwd-/FeedForward.fc/Sequential.0/Linear-1#0#521216#idx(0)[dtype(<class 'NoneType'>)]#idx(0)[shape(1,128)_dtype(torch.float32)]

# Custom allocator
fwd-/FeedForward.norm/LayerNorm-1#400#285472#idx(0)[shape(1,100)_dtype(torch.float32)]#[shape(1,100)_dtype(torch.float32)]
fwd-/FeedForward.fc/Sequential.0/Linear-1#512#285984#idx(0)[shape(1,100)_dtype(torch.float32)]#[shape(1,128)_dtype(torch.float32)]
fwd-/FeedForward.fc/Sequential.1/ReLU-1#512#286496#idx(0)[shape(1,128)_dtype(torch.float32)]#[shape(1,128)_dtype(torch.float32)]
fwd-/FeedForward.fc/Sequential.2/Linear-2#1024#287008#idx(0)[shape(1,128)_dtype(torch.float32)]#[shape(1,256)_dtype(torch.float32)]
fwd-/FeedForward.fc/Sequential.3/Dropout-1#1280#288288#idx(0)[shape(1,256)_dtype(torch.float32)]#[shape(1,256)_dtype(torch.float32)]
fwd-/FeedForward.fc/Sequential.4/Linear-3#400#287664#idx(0)[shape(1,256)_dtype(torch.float32)]#[shape(1,100)_dtype(torch.float32)]
fwd-/FeedForward-1#2592#287664#idx(0)[shape(1,100)_dtype(torch.float32)]#[shape(1,100)_dtype(torch.float32)]
bwd-/FeedForward-1#0#287676#idx(0)[dtype(<class 'NoneType'>)]#idx(0)[shape(1,100)_dtype(torch.float32)]
bwd-/FeedForward.fc/Sequential.4/Linear-3#102400#390076#idx(0)[shape(1,256)_dtype(torch.float32)]#idx(0)[shape(1,100)_dtype(torch.float32)]
bwd-/FeedForward.fc/Sequential.3/Dropout-1#768#390840#idx(0)[shape(1,256)_dtype(torch.float32)]#idx(0)[shape(1,256)_dtype(torch.float32)]
bwd-/FeedForward.fc/Sequential.2/Linear-2#131584#521400#idx(0)[shape(1,128)_dtype(torch.float32)]#idx(0)[shape(1,256)_dtype(torch.float32)]
bwd-/FeedForward.fc/Sequential.1/ReLU-1#0#520376#idx(0)[shape(1,128)_dtype(torch.float32)]#idx(0)[shape(1,128)_dtype(torch.float32)]
bwd-/FeedForward.fc/Sequential.0/Linear-1#0#519864#idx(0)[dtype(<class 'NoneType'>)]#idx(0)[shape(1,128)_dtype(torch.float32)]

Original article: https://blog.csdn.net/m0_61864577/article/details/140559116
