自学内容网 自学内容网

SSHPASS或者rsync远程自动连接服务器并且在docker中跑脚本

背景:
有一段脚本需要在多台不同的服务器上运行,每次手动连接太麻烦,所以考虑用 sshpass 和 rsync 来自动完成。
可以在脚本中配置多台服务器,然后自动去跑脚本。
配置文件

配置文件如下:
脚本通过 [xxx] 中的名称来定位并解析每台服务器的配置,所以各个 [xxx] 中的名称不能重复


# cant connect
[L20]
domain      = private_name
arch        = gpu
port        = 22
ip          = 1.0.0.1
password    = 123456
user        = root
device_id   = 2


[V100S_PCIe]
domain      = private_name
arch        = gpu
port        = 22
ip          = 10.10.10.10
password    = 123456
user        = root
device_id   = 0

详细的脚本具体如下,主要内容:
1)解析上面的配置文件,把各项配置分别存放在变量中
2)远程连接server
3) 同步文件
4)执行脚本
5)将结果同步回来

#!/bin/bash

# very important, otherwise the script will not work
# (original author's note; presumably this keeps '!' from being treated as a
# history-expansion character -- TODO confirm)
histchars=

# Help text printed for -h and for unknown options.
usage="Usage: $0 [Options]
Options:
    -f                 Forcibly delete container
    -h                 Show this help message
exp:
    1. $0 -f
"

# Parse command-line flags. The leading ':' in the optstring puts getopts in
# silent mode, so an unknown flag is reported as a literal '?' with the
# offending character in OPTARG.
while getopts ':hf' opt; do
    case "$opt" in
    f)
        # Consumed later by the main loop to remove the container before building.
        FORCE_DELETE_DOCKER=true
        ;;
    h)
        echo "$usage"
        exit 0
        ;;
    \?)
        # '?' must be escaped: unescaped it is a glob that matches any single character.
        echo "Unknown option: -${OPTARG}" >&2
        echo "$usage" >&2
        exit 1
        ;;
    esac
done


# Verbosity comes from the DEBUG environment variable (default 0 when unset).
# INFO messages are printed when it is >= 1, DEBUG messages when it is >= 2.
LOG_LEVEL=${DEBUG:-0}

# Constants Variables
# INVALID/VALID: the two results echoed by check_device_id_is_valid.
INVALID="invalid"
VALID="valid"
# ACTIVE/INACTIVE: container states echoed by docker_is_active; ACTIVE is also
# compared against the status string returned by the remote build_or_run.sh.
ACTIVE="active"
INACTIVE="inactive"
# CUDA/TOPS: suffixes echoed by get_suffix_from_type (CUDA for arch "gpu", TOPS otherwise).
CUDA="cuda"
TOPS="tops"
# NOTE: despite the name, this value is passed to ssh as -o ConnectTimeout=<value>.
RETRY_TIMES=5

# Error code
# Numeric codes echoed by the check/remote helpers; error_msg maps each one to text.
ERROR_SUCCESS=0
ERROR_CFG_PARAM_INVALID_EMPTY=1
ERROR_CFG_PARAM_INVALID_IPADDR=2
ERROR_CFG_NOT_FOUND=3
ERROR_CURRENT_DEVICE_IS_BUSY=4
ERROR_REMOTE_SSH_COMMAND_FAILED=5
ERROR_REMOTE_SSH_RSYN_FAILED=6
ERROR_CFG_PARAM_INVALID_DEVICE_ID=7
ERROR_CFG_PARAM_INVALID_DEVICE_NAME=8


# log for debug
# Print a red, timestamped warning line for message $1 on stdout.
# Always printed, regardless of LOG_LEVEL.
WARN() {
    local stamp
    stamp=$(date +"%Y-%m-%d %H:%M:%S")
    local color_on='\033[0;31m'
    local color_off='\033[0m'
    echo -e "${color_on}[WARN ] ${stamp}: $1${color_off}"
}

# Print a green, timestamped debug line for message $1 on stdout,
# but only when LOG_LEVEL is 2 or higher.
DEBUG() {
    [ "$LOG_LEVEL" -ge 2 ] || return 0
    local stamp
    stamp=$(date +"%Y-%m-%d %H:%M:%S")
    local color_on='\033[0;32m'
    local color_off='\033[0m'
    echo -e "${color_on}[DEBUG] ${stamp}: $1${color_off}"
}

# Print a green, timestamped info line for message $1 on stdout,
# but only when LOG_LEVEL is 1 or higher.
INFO() {
    [ "$LOG_LEVEL" -ge 1 ] || return 0
    local stamp
    stamp=$(date +"%Y-%m-%d %H:%M:%S")
    local color_on='\033[0;32m'
    local color_off='\033[0m'
    echo -e "${color_on}[INFO ] ${stamp}: $1${color_off}"
}

# Print a green, timestamped trace line for message $1 on stdout.
# Always printed, regardless of LOG_LEVEL.
TRACE() {
    local stamp
    stamp=$(date +"%Y-%m-%d %H:%M:%S")
    local color_on='\033[0;32m'
    local color_off='\033[0m'
    echo -e "${color_on}[TRACE] ${stamp}: $1${color_off}"
}

# Translate a numeric error code into a human-readable message.
# Arguments: $1 - one of the ERROR_* codes
# Outputs:   the matching message on stdout ("Unknown error" for any other value)
function error_msg() {
    case "$1" in
    "$ERROR_SUCCESS")
        echo "Success"
        ;;
    "$ERROR_CFG_PARAM_INVALID_EMPTY")
        echo "Invalid config params empty"
        ;;
    "$ERROR_CFG_PARAM_INVALID_IPADDR")
        echo "Invalid config params ip address"
        ;;
    "$ERROR_CFG_NOT_FOUND")
        echo "Config file not found"
        ;;
    "$ERROR_CURRENT_DEVICE_IS_BUSY")
        echo "Current device is busy"
        ;;
    "$ERROR_REMOTE_SSH_COMMAND_FAILED")
        echo "Remote ssh command failed"
        ;;
    "$ERROR_REMOTE_SSH_RSYN_FAILED")
        echo "Remote rsync command failed"
        ;;
    "$ERROR_CFG_PARAM_INVALID_DEVICE_ID")
        echo "Invalid config params device id"
        ;;
    "$ERROR_CFG_PARAM_INVALID_DEVICE_NAME")
        # Previously this branch repeated the "device id" text.
        echo "Invalid config params device name"
        ;;
    *)
        echo "Unknown error"
        ;;
    esac
}

# Global variables
# Directory that contains this script (resolved through realpath).
CURRENT_PATH=$(dirname "$(realpath "$0")")
# Parent of the first directory named "src" found underneath CURRENT_PATH.
LOCAL_PATH=$(find "$CURRENT_PATH" -type d -name "src" -exec dirname {} \; | head -n 1)
# Name of the server config file; the main script looks it up under CURRENT_PATH.
CFG_FILE="remote_cfg_template.txt"
# File passed to rsync via --exclude-from, looked up under CURRENT_PATH.
EXCLUDE_FILE=".rsync_exclude_file"
# The values below are defaults only. The main loop unsets DOMAIN, ARCH, PORT, IP,
# PASSWORD, USER, DEVICE_ID and LOG_NAME for every config section and refills them
# from the config file; REMOTE_PATH and CHIPBENCH_DOCKER_NAME are also reassigned there.
ARCH="gpu"
# Prefix placed in front of remote commands; the main loop clears it when USER is root.
SUDO="sudo"
IP="10.9.113.22"
PORT="22"
PASSWORD="123456"
USER="root"
CHIPBENCH_DOCKER_NAME="chipbenchmark.gpu"
LOG_NAME="REPORT"
DEVICE_ID="0"
REMOTE_PATH="/root"
DOMAIN="remote"

# Only printed when LOG_LEVEL >= 2.
DEBUG "DEBUG: $LOG_LEVEL"

# Strip leading and trailing whitespace from a string.
# Arguments: $1 - the string to trim
# Outputs:   the trimmed string on stdout (an empty line for empty or
#            whitespace-only input)
function string_trim()
{
    # printf instead of echo so values such as "-n" are printed literally.
    # Two independent substitutions also handle whitespace-only input, which the
    # former single-pattern expression left untouched.
    printf '%s\n' "$1" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//'
}

# Locate the line range of one "[section]" inside a config file.
# Arguments: $1 - config file path
#            $2 - section name (the text between the brackets; used inside a
#                 grep pattern, so regex metacharacters are not escaped)
# Outputs:   "<start> <end>" on stdout, where <start> is the line number of the
#            section header and <end> is the line number of the next header, or
#            the file's line count when the section is the last one. Prints an
#            empty line when the section is not found.
function get_region() {
    local cfg_file=$1
    local user_id=$2
    local lines
    # Number every header line, then keep the requested header plus the one after it.
    lines=$(grep -n '\[.*\]' "$cfg_file" | grep -A 1 "\[$user_id\]" | cut -d: -f1 | xargs)
    local -a found=()
    read -r -a found <<< "$lines"
    if [ "${#found[@]}" -eq 1 ]; then
        # Last section in the file: it extends to the final line.
        local last_line
        last_line=$(( $(wc -l < "$cfg_file") ))
        echo "$lines $last_line"
    else
        echo "$lines"
    fi
}

# Read one "key = value" entry from a "[section]" of the config file.
# Arguments: $1 - config file path
#            $2 - section name (text between the brackets)
#            $3 - key name
# Outputs:   the trimmed value on stdout; an empty line when the section or
#            the key is not present
# Uses the sibling helpers get_region and string_trim.
function get_config() {
    local cfg_file_original=$1
    local user_id=$2
    local cfg_name=$3
    local cfg_file
    # mktemp instead of a predictable dot-file in the current directory.
    cfg_file=$(mktemp) || return 1
    # Work on a copy without comment lines and empty lines so that the line
    # numbers reported by get_region refer to meaningful lines only.
    grep -vE '^#|^$' "$cfg_file_original" > "$cfg_file"
    local region start_line end_line value=""
    region=$(get_region "$cfg_file" "$user_id")
    start_line=$(echo "$region" | awk '{print $1}')
    end_line=$(echo "$region" | awk '{print $2}')
    if [ -n "$start_line" ] && [ -n "$end_line" ]; then
        # Compare the whole key (not a substring) and split only at the first
        # '=' so values may contain '=' and spaces. The first match wins.
        value=$(awk -v first="$start_line" -v last="$end_line" -v key="$cfg_name" '
            NR < first || NR > last { next }
            {
                pos = index($0, "=")
                if (pos == 0) next
                name = substr($0, 1, pos - 1)
                gsub(/^[ \t]+|[ \t]+$/, "", name)
                if (name == key) { print substr($0, pos + 1); exit }
            }' "$cfg_file")
    fi
    rm -f "$cfg_file"
    string_trim "$value"
}

# List every section name found in the config file.
# Arguments: $1 - config file path
# Outputs:   the names between "[" and "]" on one line, separated by single
#            spaces, in file order; lines starting with '#' are ignored
function get_cfg_id_list() {
    local cfg_file=$1
    local num_list
    # Plain grep/sed instead of 'grep -P' so this also works where PCRE support
    # is not compiled in; the path is quoted so it may contain spaces.
    num_list=$(grep -vE '^#|^$' "$cfg_file" \
        | grep -o '\[[^]]\{1,\}\]' \
        | sed -e 's/^\[//' -e 's/\]$//' \
        | xargs)
    echo "$num_list"
}

# Load the settings of one config section into the global connection variables.
# Arguments: $1 - section name
# Globals:   reads CURRENT_PATH and CFG_FILE; writes DOMAIN, PORT, ARCH, IP,
#            PASSWORD, USER, DEVICE_ID
function parse_config_file(){
    # Use the same file the main script validates ($CURRENT_PATH/$CFG_FILE)
    # instead of a hard-coded name relative to the current working directory.
    # The fallbacks reproduce the former behaviour when the globals are unset.
    local cfg_path="${CURRENT_PATH:-.}/${CFG_FILE:-remote_cfg_template.txt}"
    DOMAIN=$(get_config "$cfg_path" "$1" domain)
    PORT=$(get_config "$cfg_path" "$1" port)
    ARCH=$(get_config "$cfg_path" "$1" arch)
    IP=$(get_config "$cfg_path" "$1" ip)
    PASSWORD=$(get_config "$cfg_path" "$1" password)
    USER=$(get_config "$cfg_path" "$1" user)
    DEVICE_ID=$(get_config "$cfg_path" "$1" device_id)
}


# Validate a dotted-quad IPv4 address.
# Arguments: $1 - candidate address
# Outputs:   ERROR_SUCCESS on stdout when $1 consists of four decimal groups of
#            1-3 digits, each in the range 0..255; otherwise
#            ERROR_CFG_PARAM_INVALID_IPADDR
function check_ipaddr_is_correct()
{
    local ret=$ERROR_SUCCESS
    # Kept in a variable so the pattern is used as a regex, not re-parsed by the shell.
    local ipv4_re='^([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})$'
    local octet
    if [[ "$1" =~ $ipv4_re ]]; then
        for octet in "${BASH_REMATCH[@]:1}"; do
            # 10# forces base 10 so groups such as "010" are not read as octal.
            if (( 10#$octet > 255 )); then
                ret=$ERROR_CFG_PARAM_INVALID_IPADDR
            fi
        done
    else
        ret=$ERROR_CFG_PARAM_INVALID_IPADDR
    fi
    echo "$ret"
}

# Run a local shell command given as a single string.
# Arguments: $1 - command line to evaluate
# Returns:   the status of the evaluated command; when $1 is empty a warning
#            is logged instead and nothing is executed
function do_cmd() {
    if [ -z "$1" ]; then
        WARN "cmd str is null."
        return
    fi
    DEBUG "$1"
    eval "$1"
}

# Run a local shell command given as a single string, discarding its output.
# Arguments: $1 - command line to evaluate
# Returns:   the status of the evaluated command; when $1 is empty a warning
#            is logged instead and nothing is executed
function do_cmd_silent() {
    if [ -z "$1" ]; then
        WARN "cmd str is null."
        return
    fi
    DEBUG "$1"
    # "silent" means stdout and stderr of the command are both dropped
    eval "$1" > /dev/null 2>&1
}

# Build the local command line that runs "$1" on the remote host via sshpass/ssh.
# Arguments: $1 - remote command string
# Globals:   reads PASSWORD, PORT, RETRY_TIMES (used as ssh ConnectTimeout),
#            USER, IP, SUDO
# Outputs:   the command line on stdout, meant to be passed to eval
function _build_remote_cmd() {
    local quoted_pw
    # %q makes the password safe to embed in a string that is later eval'ed,
    # even when it contains quotes or spaces.
    printf -v quoted_pw '%q' "${PASSWORD}"
    local ssh_cmd="SSHPASS=${quoted_pw} sshpass -e ssh -p ${PORT} -o ConnectTimeout=${RETRY_TIMES} -o StrictHostKeyChecking=no ${USER}@${IP} ${SUDO}"
    if [ "${USER}" == "root" ]; then
        printf '%s\n' "${ssh_cmd} \"$1\""
    else
        # Non-root: the password is also piped to stdin for "sudo -S".
        printf '%s\n' "printf '%s\\n' ${quoted_pw} | ${ssh_cmd} -S --prompt= \"$1\""
    fi
}

# Run a command on the remote host, letting its output through.
# Arguments: $1 - remote command string
# Returns:   the status of the ssh pipeline; logs a warning when $1 is empty
function do_remote_cmd() {
    if [ -n "$1" ]; then
        local cmd
        cmd=$(_build_remote_cmd "$1")
        DEBUG "$cmd"
        eval "$cmd"
    else
        WARN "cmd str is null."
    fi
}

# Run a command on the remote host, discarding its output.
# Arguments: $1 - remote command string
# Outputs:   ERROR_SUCCESS or ERROR_REMOTE_SSH_COMMAND_FAILED as the last line
#            on stdout (log lines may precede it, so callers use "tail -n 1")
function do_remote_cmd_silent() {
    local ret=$ERROR_SUCCESS
    if [ -n "$1" ]; then
        local cmd
        cmd=$(_build_remote_cmd "$1")
        DEBUG "$cmd"
        # "silent" means stdout and stderr of the remote command are both dropped
        if ! eval "$cmd" > /dev/null 2>&1; then
            ret=$ERROR_REMOTE_SSH_COMMAND_FAILED
        fi
    else
        WARN "cmd str is null."
        ret=$ERROR_REMOTE_SSH_COMMAND_FAILED
    fi
    echo "$ret"
}

# Run a command on the remote host and print what it wrote to stdout.
# Arguments: $1 - remote command string
# Outputs:   the remote command's stdout; a warning line when $1 is empty
function do_remote_cmd_with_return() {
    if [ -n "$1" ]; then
        local cmd output
        cmd=$(_build_remote_cmd "$1")
        # No DEBUG here: it would end up in the value captured by the caller.
        output=$(eval "$cmd")
        echo "$output"
    else
        WARN "cmd str is null."
    fi
}

# sync local/remote file to remote/local server
# e.g. do_sync_cmd dir1 dir2 , means sync dir1 to dir2 #3060 adduser suiyuan root. mkdir /home/chipbench/workspace
# Copy a directory tree with rsync over sshpass/ssh.
# Arguments: $1 - source (local path or host:path)
#            $2 - destination (local path or host:path)
# Globals:   reads PASSWORD, PORT, RETRY_TIMES (used as ssh ConnectTimeout),
#            USER, CURRENT_PATH, EXCLUDE_FILE
# Returns:   the status of do_cmd; logs a warning when $1 is empty
function do_sync_cmd() {
    if [ -n "$1" ]; then
        local quoted_pw
        # %q keeps passwords containing quotes or spaces intact through eval.
        printf -v quoted_pw '%q' "${PASSWORD}"
        local cmd="SSHPASS=${quoted_pw} rsync --rsync-path=\"rsync --no-p --no-g --chmod=ugo=rwX\"  --exclude-from=\"$CURRENT_PATH/${EXCLUDE_FILE}\"  -a --rsh=\"sshpass -e ssh -p ${PORT} -o ConnectTimeout=${RETRY_TIMES} -l ${USER}\" ${1} ${2}"
        do_cmd "$cmd"
    else
        WARN "cmd str is null."
    fi
}

# Copy a directory tree with rsync over sshpass/ssh, discarding all output.
# Arguments: $1 - source (local path or host:path)
#            $2 - destination (local path or host:path)
# Globals:   reads PASSWORD, PORT, RETRY_TIMES (used as ssh ConnectTimeout),
#            USER, CURRENT_PATH, EXCLUDE_FILE
# Returns:   the status of do_cmd; logs a warning when $1 is empty
function do_sync_cmd_silent() {
    if [ -n "$1" ]; then
        local quoted_pw
        # %q keeps passwords containing quotes or spaces intact through eval.
        printf -v quoted_pw '%q' "${PASSWORD}"
        local cmd="SSHPASS=${quoted_pw} rsync --rsync-path=\"rsync --no-p --no-g --chmod=ugo=rwX\"  --exclude-from=\"$CURRENT_PATH/${EXCLUDE_FILE}\"  -a --rsh=\"sshpass -e ssh -p ${PORT} -o ConnectTimeout=${RETRY_TIMES} -l ${USER}\" ${1} ${2}"
        # "silent" means stdout and stderr are both dropped
        do_cmd "$cmd" > /dev/null 2>&1
    else
        WARN "cmd str is null."
    fi
}

# Make sure a required command-line tool is installed.
# Arguments: $1 - command name
# Behaviour: exits the script with status 1 (after logging a hint) when the
#            command cannot be found; otherwise logs where it was found
function command_is_exist() {
    if ! command -v "${1}" &> /dev/null
    then
        WARN "${1} could not be found. Please install ${1}."
        WARN "For Ubuntu, you can install it using: sudo apt install ${1}"
        exit 1
    else
        # Report the tool that was actually checked; the previous code always
        # ran "sshpass -V", even when checking a different command.
        local msg
        msg="${1} found: $(command -v "${1}")"
        DEBUG "$msg"
    fi
}


# Check that a device index is within the number of devices present.
# Arguments: $1 - device id from the config file
#            $2 - number of devices reported by the remote host
# Outputs:   VALID on stdout when $1 is a non-negative integer smaller than $2,
#            INVALID otherwise (including empty or non-numeric arguments)
function check_device_id_is_valid() {
    local uint_re='^[0-9]+$'
    # Drop surrounding whitespace (some wc implementations pad their counts).
    local id=${1//[[:space:]]/}
    local total=${2//[[:space:]]/}
    # 10# forces base 10 so values with leading zeros are not read as octal.
    if [[ "$id" =~ $uint_re && "$total" =~ $uint_re ]] && (( 10#$id < 10#$total )); then
        echo "${VALID}"
    else
        echo "${INVALID}"
    fi
}

# Print the suffix that belongs to the current architecture:
# CUDA when ARCH is "gpu", TOPS for anything else.
function get_suffix_from_type() {
    case "$ARCH" in
    gpu)
        echo ${CUDA}
        ;;
    *)
        echo ${TOPS}
        ;;
    esac
}

# Remove a regular file if it exists.
# Arguments: $1 - path of the file
function delete_file_if_exit(){
    if [ -f "$1" ]; then
        # Quoted, and '--' ends option parsing, so paths containing spaces or
        # starting with '-' remove exactly the file that was tested above.
        rm -rf -- "$1"
        DEBUG "rm -rf $1"
    fi
}

# Abort the script with status 1 (after a warning) unless $1 is an existing
# regular file.
function check_file_is_exist_or_exit() {
    [ -f "$1" ] && return
    WARN "File $1 not found!"
    exit 1
}

# Abort the script with status 1 (after a warning) unless $1 is an existing
# directory; otherwise log that it was found.
function check_dir_is_exist_or_exit() {
    if [ -d "$1" ]; then
        DEBUG "Directory $1 found!"
    else
        WARN "Directory $1 not found!"
        exit 1
    fi
}

# Abort the script with status 1 (after a warning) when directory $1 has no
# entries; otherwise log that it is not empty.
# Arguments: $1 - directory path
function check_dir_has_files_or_exit() {
    # Quoted, with '--', so that paths containing spaces are listed correctly
    # instead of being split into several arguments.
    if [ -z "$(ls -A -- "$1")" ]; then
        WARN "Directory[$1] is empty. Exiting..."
        exit 1
    else
        DEBUG "Directory[$1] is NOT empty. continue..."
    fi
}


# Report whether a container with the given name is running on the remote host.
# Arguments: $1 - container name (docker's name filter matches substrings)
# Outputs:   ACTIVE on stdout when at least one running container matches,
#            INACTIVE otherwise. At LOG_LEVEL >= 2 the DEBUG lines are printed
#            to stdout as well.
# Behaviour: exits the script with status 1 when $1 is empty.
function docker_is_active() {
    # Check if 1 is set
    if [ -z "$1" ]; then
        WARN "CHIPBENCH_DOCKER_NAME is not set."
        exit 1
    fi

    # List the ids of matching running containers. The command has no embedded
    # quotes or pipes, so it survives being wrapped in double quotes by
    # do_remote_cmd_with_return, which also prepends ${SUDO} itself.
    local cmd="docker ps --quiet --filter name=$1 --filter status=running"
    DEBUG "CMD: $cmd"
    local ids
    # Passed as ONE quoted argument; unquoted, only the first word would arrive.
    ids=$(do_remote_cmd_with_return "$cmd")
    DEBUG "s: $ids"
    local num_active
    # Count non-blank lines; grep -c prints 0 when nothing matches.
    num_active=$(printf '%s\n' "$ids" | grep -c '[^[:space:]]')
    DEBUG "NUM_ACTIVE: $num_active"
    if [ "$num_active" -gt 0 ]; then
        echo "${ACTIVE}"
    else
        echo "${INACTIVE}"
    fi
}

# Print the failure banner for one config case.
# Arguments: $1 - error message
#            $2 - additional detail shown in brackets after the message
#            $3 - name of the config case
dump_fail_result_msg(){
    local banner_line
    for banner_line in \
        "***********************************************************************************" \
        "* CFG CASE   : $3" \
        "* FAILED" \
        "* $1[$2]" \
        "**********************************************************************************"
    do
        WARN "$banner_line"
    done
}

# Print the success banner for one config case.
# Arguments: $1 - name of the config case
# Globals:   reads LOG_NAME
dump_success_result_msg(){
    local banner_line
    for banner_line in \
        "*********************************************************************************" \
        "* CFG CASE   : $1" \
        "* SUCCESS" \
        "* log saved to $LOG_NAME" \
        "*********************************************************************************"
    do
        TRACE "$banner_line"
    done
}

# Log (at INFO level) the settings that will be used for one config case.
# Arguments: $1 - name of the config case
# Globals:   reads ARCH, DOMAIN, IP, PORT, PASSWORD, USER, DEVICE_ID, LOG_NAME,
#            LOCAL_PATH, CURRENT_PATH, REMOTE_PATH
dump_config_msg(){
    local info_line
    for info_line in \
        "=================================" \
        "case name   : $1" \
        "arch        : $ARCH" \
        "domain      : $DOMAIN" \
        "remote IP   : $IP" \
        "remote PORT : $PORT" \
        "remote PW   : $PASSWORD" \
        "remote USER : $USER" \
        "device id   : $DEVICE_ID" \
        "log name    : $LOG_NAME" \
        "local path  : $LOCAL_PATH" \
        "current path: $CURRENT_PATH" \
        "remote_path : $REMOTE_PATH"
    do
        INFO "$info_line"
    done
}

# Verify that the mandatory connection settings are all non-empty.
# Globals:   reads DOMAIN, PORT, ARCH, IP, USER, DEVICE_ID, REMOTE_PATH
# Outputs:   ERROR_SUCCESS on stdout when every value is set, otherwise
#            ERROR_CFG_PARAM_INVALID_EMPTY
function check_cfg_param_is_empty() {
    local ret=$ERROR_SUCCESS
    local value
    for value in "$DOMAIN" "$PORT" "$ARCH" "$IP" "$USER" "$DEVICE_ID" "$REMOTE_PATH"; do
        if [ -z "$value" ]; then
            ret=$ERROR_CFG_PARAM_INVALID_EMPTY
            break
        fi
    done
    echo $ret
}

# NOTE(review): SSH_ERR is not referenced anywhere in the visible script.
SSH_ERR="ssh connect to host $IP port $PORT: Connection refused"

# 1.Check if the build directory has files, if not, exit
BUILD_DIR="${LOCAL_PATH}/src/build"
check_dir_is_exist_or_exit "$BUILD_DIR"
check_dir_has_files_or_exit "$BUILD_DIR"
# 2. Parse IP, PASSWORD, USER from cfg file
check_file_is_exist_or_exit "$CURRENT_PATH/$CFG_FILE"

# Create tmp log dir (any directory left over from a previous run is removed first)
LOD_TMP_DIR="build_case_log"
LOG_DIR="${LOCAL_PATH}/src/${LOD_TMP_DIR}"
# The inner quotes are escaped so they reach eval intact; previously they
# closed the outer string and left the path unquoted.
do_cmd_silent "test -d \"${LOG_DIR}\" && rm -rf \"${LOG_DIR}\""
do_cmd_silent "mkdir -p \"${LOG_DIR}\""
DEBUG "LOG_DIR: $LOG_DIR"

# Section names of every server described in the config file.
UINT_LIST=$(get_cfg_id_list "$CURRENT_PATH/$CFG_FILE")
UINT_NUM=$(echo "$UINT_LIST" | wc -w)
# Initialize counters
SUCCESS_COUNT=0
FAIL_COUNT=0
DEBUG "UINT_NUM: $UINT_NUM:[${UINT_LIST}]"
# Start time
start_time=$(date +%s)
# Process every server section of the config file in turn. A failure in one
# section is reported and counted, then the loop moves on to the next section.
for i in $UINT_LIST; do
    unset DOMAIN ARCH PORT IP PASSWORD USER DEVICE_ID LOG_NAME 
    parse_config_file "$i"

    # Make sure the REMOTE_PATH path is in the user directory, especially for non-root users, or rsync will fail
    REMOTE_PATH="/tmp/${DOMAIN}/chipbench"
    LOG_NAME=${i}.log
    # Inner quotes are escaped so they reach eval intact.
    do_cmd_silent "test -f \"${LOG_NAME}\" && rm -f \"${LOG_NAME}\""

    # Set SUDO explicitly in both cases: once a root section had cleared it, it
    # used to stay empty for every later non-root section.
    if [ "${USER}" == "root" ]; then
        SUDO=""
    else
        SUDO="sudo"
    fi

    # must after remote_path
    dump_config_msg "$i"

    # Check if the necessary parameters are empty
    ret_code=$(check_cfg_param_is_empty)
    DEBUG "check_cfg_param_is_empty ret_code: $ret_code"
    if [ "$ret_code" != "$ERROR_SUCCESS" ]; then
        err_str=$(error_msg "$ret_code")
        dump_fail_result_msg "$err_str" "some params are null" "${i}"
        FAIL_COUNT=$((FAIL_COUNT + 1))
        continue
    fi

    # Check if the IP address is valid
    ret_code=$(check_ipaddr_is_correct "$IP")
    DEBUG "check_ipaddr_is_correct ret_code: $ret_code"
    if [ "$ret_code" != "$ERROR_SUCCESS" ]; then
        err_str=$(error_msg "$ret_code")
        dump_fail_result_msg "$err_str" "$IP" "${i}"
        FAIL_COUNT=$((FAIL_COUNT + 1))
        continue
    fi

    # 3. Check whether sshpass and rsync are installed
    command_is_exist sshpass
    command_is_exist rsync
    
    # 4. Check if REMOTE_PATH exists on the remote server, create it if it does not
    # do_remote_cmd "adduser ${USER} root"
    # Check if the user is not in the root group and add them to the root group if they are not
    # if ! id -nG "$USER" | grep -qw "root"; then
    #     DEBUG "User $USER is not in the root group. Adding to root group..."
    #     do_remote_cmd_silent "adduser ${USER} root"
    # else
    #     DEBUG "User $USER is already in the root group."
    # fi

    # Check ssh connection is ok
    # (tail -n 1: at LOG_LEVEL >= 2 the helper also prints debug lines before the code)
    ret_code=$(do_remote_cmd_silent "pwd" |tail -n 1)
    DEBUG "do_remote_cmd_silent ret_code: $ret_code"
    if [ "$ret_code" != "$ERROR_SUCCESS" ]; then
        err_str=$(error_msg "$ret_code")
        dump_fail_result_msg "$err_str" "failed to connect to ${USER}:${IP}" "${i}"
        FAIL_COUNT=$((FAIL_COUNT + 1))
        continue
    fi

    # Check if the user is in the root group and add them to the root group
    if [ "$USER" != "root" ]; then
        ret_code=$(do_remote_cmd_silent "adduser ${USER} root" | tail -n 1)
        DEBUG "do_remote_cmd_silent ret_code: $ret_code"
        if [ "$ret_code" != "$ERROR_SUCCESS" ]; then
            err_str=$(error_msg "$ret_code")
            dump_fail_result_msg "$err_str" "adduser ${USER} root" "${i}"
            FAIL_COUNT=$((FAIL_COUNT + 1))
            continue
        fi
    fi

    # delete remote path /tmp/${DOMAIN}/chipbench if it exists
    # (result deliberately ignored: the path may simply not exist yet)
    ret_code=$(do_remote_cmd_silent "test -d ${REMOTE_PATH}/ && rm -rf ${REMOTE_PATH}/")

    # create remote path /tmp/${DOMAIN}/chipbench
    ret_code=$(do_remote_cmd_silent "mkdir -p ${REMOTE_PATH}/" | tail -n 1)
    DEBUG "do_remote_cmd_silent ret_code: $ret_code"
    if [ "$ret_code" != "$ERROR_SUCCESS" ]; then
        err_str=$(error_msg "$ret_code")
        dump_fail_result_msg "$err_str" "mkdir -p ${REMOTE_PATH}" "${i}"
        FAIL_COUNT=$((FAIL_COUNT + 1))
        continue
    fi

    # chown
    ret_code=$(do_remote_cmd_silent "chown -R $USER:$USER ${REMOTE_PATH}/" | tail -n 1)
    DEBUG "do_remote_cmd_silent ret_code: $ret_code"
    if [ "$ret_code" != "$ERROR_SUCCESS" ]; then
        err_str=$(error_msg "$ret_code")
        dump_fail_result_msg "$err_str" "chown -R $USER:$USER ${REMOTE_PATH}" "${i}"
        FAIL_COUNT=$((FAIL_COUNT + 1))
        continue
    fi

    # 5. Sync the current directory to the remote server
    DEBUG "sync current directory to remote server"
    if [ "$LOG_LEVEL" -ge 2 ]; then
        do_sync_cmd "${LOCAL_PATH}/" "${IP}:${REMOTE_PATH}/"
    else
        ret_code=$(do_sync_cmd_silent "${LOCAL_PATH}/" "${IP}:${REMOTE_PATH}/")
    fi

    # 6. Build the docker container
    CHIPBENCH_DOCKER_NAME=$(do_remote_cmd_with_return "${REMOTE_PATH}/docker/build_or_run.sh name $ARCH $DOMAIN")
    DEBUG "Build docker container[$CHIPBENCH_DOCKER_NAME]..."
    if [ "$FORCE_DELETE_DOCKER" ]; then
        DEBUG "Force delete the docker container[$CHIPBENCH_DOCKER_NAME]..."
        ret_code=$(do_remote_cmd_silent "${REMOTE_PATH}/docker/build_or_run.sh rm $ARCH $DOMAIN")
    fi

    if [ "$LOG_LEVEL" -ge 2 ]; then
        do_remote_cmd "${REMOTE_PATH}/docker/build_or_run.sh build $ARCH $DOMAIN"
    else
        ret_code=$(do_remote_cmd_silent "${REMOTE_PATH}/docker/build_or_run.sh build $ARCH $DOMAIN")
    fi

    # 7.if container is already running, skip the restart
    mount_cmd="${SUDO} mount -t nfs -o ro -o vers=3 10.9.231.206:/ef_Infra/devtools /home/.devtools"
    ret_code=$(do_remote_cmd_silent "${mount_cmd}")
    DOCKER_ACTIVE=$(do_remote_cmd_with_return "${REMOTE_PATH}/docker/build_or_run.sh status $ARCH $DOMAIN")
    DEBUG "docker container [$CHIPBENCH_DOCKER_NAME] status: $DOCKER_ACTIVE"
    if [ "$DOCKER_ACTIVE" == "${ACTIVE}" ]; then
        DEBUG "Docker container[$CHIPBENCH_DOCKER_NAME] is already running."
    else
        DEBUG "Start the docker container[$CHIPBENCH_DOCKER_NAME]..."
        ret_code=$(do_remote_cmd_silent "${REMOTE_PATH}/docker/build_or_run.sh restart $ARCH $DOMAIN")
    fi

    # get the gpu name
    if [ "$ARCH" == "gpu" ];then
        DEVICE_NAME=$(do_remote_cmd_with_return "nvidia-smi --query-gpu name --format=noheader,csv -i ${DEVICE_ID}")
    else
        inquery_cmd="ppp| grep -E '^[| ].[$DEVICE_ID-$DEVICE_ID].[ ]'| grep -v 'C'"
        tmp_name=$(do_remote_cmd_with_return "${inquery_cmd}")
        DEVICE_NAME=$(echo "$tmp_name" | awk '{print $3}')
    fi

    # Check if the docker container env is correct
    if [ "$ARCH" == "gcu" ];then
        ret_code=$(do_remote_cmd_silent "${SUDO} docker exec $CHIPBENCH_DOCKER_NAME /bin/bash -c \\\"topscc --version\\\"" | tail -n 1)
        if [ "$ret_code" != "$ERROR_SUCCESS" ]; then
            err_str=$(error_msg "$ret_code")
            DEBUG "err_str: $err_str, try create env"
            WARN "${DEVICE_NAME}, ENV is not correct, try to create env, WAITTING..."
            random_str=$(date +%s%N)
            deb_path=/tmp/${random_str}
            do_remote_cmd "test ! -d ${deb_path} && ${SUDO} mkdir -p ${deb_path}"
            efgrab_cmd="source /home/.devtools/tools/env.sh && cd ${deb_path} && efgrab efml && efgrab topsplatform && chmod +x ./*.run && dpkg -i ./*.deb && ./*.run --no-auto-load -y && rm -rf ${deb_path}"
            if [ "${LOG_LEVEL}" -ge 2 ]; then
                do_remote_cmd "${SUDO} docker exec $CHIPBENCH_DOCKER_NAME /bin/bash -c \\\"${efgrab_cmd}\\\""
            else
                ret_code=$(do_remote_cmd_silent "${SUDO} docker exec $CHIPBENCH_DOCKER_NAME /bin/bash -c \\\"${efgrab_cmd}\\\"")
            fi
            efml_so_path=$(do_remote_cmd_with_return "${SUDO} docker exec $CHIPBENCH_DOCKER_NAME /bin/bash -c \\\"dpkg -L efml |grep  -F  libefml.so\\\"" | tail -n 1)
            efml_so_dir_path=$(dirname "$efml_so_path")
            DEBUG "efml_so_dir_path: $efml_so_dir_path"
            ret_code=$(do_remote_cmd_silent "${SUDO} docker exec $CHIPBENCH_DOCKER_NAME /bin/bash -c \\\"cp -f ${efml_so_dir_path}/libefml.so* /usr/lib/\\\"")
        fi
    fi

    # 8. Run the test case pppp -L  | awk '/[0-9]/ {print $1}'|grep -v "-"|wc -l
    # 8.1 Check if the device id is valid
    DEBUG "Check if the device id is valid"
    if [ "$ARCH" == "gpu" ];then
        inquery_cmd=" nvidia-smi -L"
        DEVICE_INFO=$(do_remote_cmd_with_return "${inquery_cmd}")
        DEVICE_NUM=$(echo "$DEVICE_INFO" | grep -v '^\s*$' | wc -l)
    else
        inquery_cmd="pppp -L"
        DEVICE_INFO=$(do_remote_cmd_with_return "${inquery_cmd}")
        DEVICE_NUM=$(echo "$DEVICE_INFO" | awk '/[0-9]/ {print $1}'| grep -v "-"| wc -l)
    fi

    DEBUG "$DEVICE_INFO"
    DEBUG "GPU ID is :${DEVICE_ID} , GPU count is $DEVICE_NUM"
    valid_result=$(check_device_id_is_valid "$DEVICE_ID" "$DEVICE_NUM")
    DEBUG "valid_result: $valid_result"
    if [ "$valid_result" == "${INVALID}" ]; then
        err_str=$(error_msg "$ERROR_CFG_PARAM_INVALID_DEVICE_ID")
        dump_fail_result_msg "$err_str" "Invalid device id $DEVICE_ID, device id should be [0,$(($DEVICE_NUM-1))]" "${i}"
        FAIL_COUNT=$((FAIL_COUNT + 1))
        continue
    fi
    DEBUG "Device id $DEVICE_ID is valid."
    # 8.2 Peek at the remote directory (debug output only)
    if [ "$LOG_LEVEL" -ge 2 ]; then
        do_remote_cmd "ls -la ${REMOTE_PATH}"
    fi

    # 8.3 Get the GPU Information
    if [ "$ARCH" == "gpu" ];then
        inquery_cmd="nvidia-smi"
    else
        inquery_cmd="pppp"
    fi
    DEVICE_INFO=$(do_remote_cmd_with_return "${inquery_cmd}")
    DEBUG "$DEVICE_INFO"

    # 8.4 Check the GPU processes. If there are processes running on the GPU, exit
    DEBUG "Current device id is $DEVICE_ID, name is:$DEVICE_NAME"
    # 8.4.1 Check if the GPU Name is consistent with the cfg file
    # (both names are lower-cased; '-' and '_' are turned into spaces before comparing)
    lowercase_device_name=$(echo "$DEVICE_NAME" | tr '[:upper:]' '[:lower:]' | tr '-' ' ')
    cfg_device_name=$(echo "$i" | tr '[:upper:]' '[:lower:]' | tr '_' ' ')
    DEBUG "lowercase_device_name: $lowercase_device_name"
    DEBUG "cfg_device_name: $cfg_device_name"
    do_cmd_silent "echo '$lowercase_device_name' | grep -q '$cfg_device_name'"
    if [ $? -ne 0 ]; then
        err_str=$(error_msg "$ERROR_CFG_PARAM_INVALID_DEVICE_NAME")
        dump_fail_result_msg "$err_str" "Device name is $DEVICE_NAME" "${i}"
        FAIL_COUNT=$((FAIL_COUNT + 1))
        continue
    fi

    # 8.4.2 Check if the GPU is busy
    DEBUG "get the GPU process"
    if [ "$ARCH" == "gpu" ];then
        # The result used to be stored in a misspelled variable ("ROCESS"), so
        # PROCESS stayed empty and a busy GPU was never detected.
        PROCESS=$(do_remote_cmd_with_return "nvidia-smi --query-compute-apps pid --format=noheader,csv -i ${DEVICE_ID}")
        PROCESS_NUM=$(echo "$PROCESS" | grep -v '^\s*$' | wc -l)
        if [ "$PROCESS_NUM" -gt 0 ]; then
            err_str=$(error_msg "$ERROR_CURRENT_DEVICE_IS_BUSY")
            dump_fail_result_msg "$err_str" "There are $PROCESS_NUM processes running on GPU $DEVICE_NAME:${DEVICE_ID}" "${i}"
            FAIL_COUNT=$((FAIL_COUNT + 1))
            continue
        fi
    else
        gcu_info=$(do_remote_cmd_with_return "pppp --pmon -c 1 -o -i ${DEVICE_ID}")
        gcu_no_process=$(echo "$gcu_info" | grep  'no process running on' | wc -l)
        DEBUG "gcu_no_process: $gcu_no_process"
        if [ "$gcu_no_process" -eq 0 ]; then
            err_str=$(error_msg "$ERROR_CURRENT_DEVICE_IS_BUSY")
            dump_fail_result_msg "$err_str" "There are processes running on GPU $DEVICE_NAME:${DEVICE_ID}" "${i}"
            FAIL_COUNT=$((FAIL_COUNT + 1))
            continue
        fi
    fi

    # 8.5 Run the test case
    DEBUG "No process is running on GPU $DEVICE_ID"
    if [ "${LOG_LEVEL}" -ge 2 ]; then
        do_remote_cmd "docker exec $CHIPBENCH_DOCKER_NAME /bin/bash -c \\\"CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=${DEVICE_ID} ${REMOTE_PATH}/src/report.sh ${LOG_NAME} ${ARCH}\\\""
    else
        ret_code=$(do_remote_cmd_silent "docker exec $CHIPBENCH_DOCKER_NAME /bin/bash -c \\\"CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=${DEVICE_ID} ${REMOTE_PATH}/src/report.sh ${LOG_NAME} ${ARCH}\\\"")
    fi

    # Sync the remote directory back to the local directory
    DEBUG "sync report file to local"
    do_sync_cmd_silent "${IP}:${REMOTE_PATH}/" "${LOCAL_PATH}/"

    do_cmd_silent "mv -f \"${i}.log\" \"${LOG_DIR}/\""

    # 9. Save the log file
    INFO "Log saved in $LOG_NAME"

    # Stop the docker container
    # WARN "Stop the docker container, [$CHIPBENCH_DOCKER_NAME]..."
    # do_remote_cmd_silent "${REMOTE_PATH}/docker/build_or_run.sh stop $ARCH $DOMAIN"

    SUCCESS_COUNT=$((SUCCESS_COUNT + 1))
    dump_success_result_msg "${i}"
done

# Report the wall-clock duration and the per-case tally, then finish.
# The script exits with status 0 whether or not individual cases failed.
end_time=$(date +%s)
elapsed_time=$((end_time - start_time))
printf '%s\n' "Elapsed time: $elapsed_time seconds"
printf '%s\n' "All count: ${UINT_NUM}, Success count: $SUCCESS_COUNT, Fail count: $FAIL_COUNT"
printf '%s\n' "All done."

exit 0



原文地址:https://blog.csdn.net/weixin_43360707/article/details/143821275

免责声明:本站文章内容转载自网络资源,如本站内容侵犯了原著者的合法权益,可联系本站删除。更多内容请关注自学内容网(zxcms.com)!