SSHPASS或者rsync远程自动连接服务器并且在docker中跑脚本
背景:
一段脚本,需要在不同服务器上去跑,每次手动连接太麻烦,所以考虑用sshpass和rsync来实现自动化。
可以在脚本中配置多台服务器,然后自动去跑脚本。
配置文件
配置文件如下:
脚本主要通过[xxx]中的名称来定位并解析对应的配置段,所以各段[xxx]中的名称不要重复。
# cant connect
[L20]
domain = private_name
arch = gpu
port = 22
ip = 1.0.0.1
password = 123456
user = root
device_id = 2
[V100S_PCIe]
domain = private_name
arch = gpu
port = 22
ip = 10.10.10.10
password = 123456
user = root
device_id = 0
详细的脚本具体如下,主要内容:
1)解析上面的配置文件,分别存放在变量中
2)远程连接server
3) 同步文件
4)执行脚本
5)将结果同步回来
#!/bin/bash
# Clear bash's history-expansion characters. The original author marks this as
# required for the script to work.
histchars=
# Help text printed for -h and for every option other than -f.
usage="Usage: $0 [Options]
Options:
-f Forcibly delete container
exp:
1. $0 -f
"
# Command-line parsing: -f requests forced removal of the docker container;
# any other option (including -h) prints the help text and exits with 1.
while getopts ':hf' opt; do
    if [ "$opt" = "f" ]; then
        FORCE_DELETE_DOCKER=true
    else
        echo "$usage"
        exit 1
    fi
done
# Verbosity is taken from the DEBUG environment variable; 0 when it is unset
# or empty. The INFO logger prints at level >= 1, the DEBUG logger at >= 2.
if [ -n "${DEBUG}" ]; then
    LOG_LEVEL=${DEBUG}
else
    LOG_LEVEL=0
fi
# Symbolic string values shared by the helper functions
INVALID=invalid
VALID=valid
ACTIVE=active
INACTIVE=inactive
CUDA=cuda
TOPS=tops
# Used as the ssh ConnectTimeout value (seconds) when building remote commands
RETRY_TIMES=5
# Numeric error codes; error_msg turns them into readable text
ERROR_SUCCESS=0
ERROR_CFG_PARAM_INVALID_EMPTY=1
ERROR_CFG_PARAM_INVALID_IPADDR=2
ERROR_CFG_NOT_FOUND=3
ERROR_CURRENT_DEVICE_IS_BUSY=4
ERROR_REMOTE_SSH_COMMAND_FAILED=5
ERROR_REMOTE_SSH_RSYN_FAILED=6
ERROR_CFG_PARAM_INVALID_DEVICE_ID=7
ERROR_CFG_PARAM_INVALID_DEVICE_NAME=8
# log for debug
# Print a red, timestamped "[WARN ]" line on stdout.
# Always printed, independent of LOG_LEVEL.
#   $1 - message text (backslash escapes are interpreted by echo -e)
WARN() {
    local color_on='\033[0;31m'
    local color_off='\033[0m'
    local stamp=$(date +"%Y-%m-%d %H:%M:%S")
    echo -e "${color_on}[WARN ] ${stamp}: $1${color_off}"
}
# Print a green, timestamped "[DEBUG]" line on stdout, but only when
# LOG_LEVEL is 2 or higher; otherwise do nothing and return 0.
#   $1 - message text (backslash escapes are interpreted by echo -e)
DEBUG() {
    [ "$LOG_LEVEL" -ge 2 ] || return 0
    local color_on='\033[0;32m'
    local color_off='\033[0m'
    local stamp=$(date +"%Y-%m-%d %H:%M:%S")
    echo -e "${color_on}[DEBUG] ${stamp}: $1${color_off}"
}
# Print a green, timestamped "[INFO ]" line on stdout, but only when
# LOG_LEVEL is 1 or higher; otherwise do nothing and return 0.
#   $1 - message text (backslash escapes are interpreted by echo -e)
INFO() {
    [ "$LOG_LEVEL" -ge 1 ] || return 0
    local color_on='\033[0;32m'
    local color_off='\033[0m'
    local stamp=$(date +"%Y-%m-%d %H:%M:%S")
    echo -e "${color_on}[INFO ] ${stamp}: $1${color_off}"
}
# Print a green, timestamped "[TRACE]" line on stdout.
# Always printed, independent of LOG_LEVEL.
#   $1 - message text (backslash escapes are interpreted by echo -e)
TRACE() {
    local color_on='\033[0;32m'
    local color_off='\033[0m'
    local stamp=$(date +"%Y-%m-%d %H:%M:%S")
    echo -e "${color_on}[TRACE] ${stamp}: $1${color_off}"
}
# Translate one of the ERROR_* codes into a human-readable message.
#   $1      - numeric error code
#   stdout  - the message; "Unknown error" for any unrecognised code
function error_msg() {
    case "$1" in
        "$ERROR_SUCCESS")
            echo "Success"
            ;;
        "$ERROR_CFG_PARAM_INVALID_EMPTY")
            echo "Invalid config params empty"
            ;;
        "$ERROR_CFG_PARAM_INVALID_IPADDR")
            echo "Invalid config params ip address"
            ;;
        "$ERROR_CFG_NOT_FOUND")
            echo "Config file not found"
            ;;
        "$ERROR_CURRENT_DEVICE_IS_BUSY")
            echo "Current device is busy"
            ;;
        "$ERROR_REMOTE_SSH_COMMAND_FAILED")
            echo "Remote ssh command failed"
            ;;
        "$ERROR_REMOTE_SSH_RSYN_FAILED")
            echo "Remote rsync command failed"
            ;;
        "$ERROR_CFG_PARAM_INVALID_DEVICE_ID")
            echo "Invalid config params device id"
            ;;
        "$ERROR_CFG_PARAM_INVALID_DEVICE_NAME")
            # Previously printed the "device id" text, which made a device
            # NAME mismatch indistinguishable from a bad device id.
            echo "Invalid config params device name"
            ;;
        *)
            echo "Unknown error"
            ;;
    esac
}
# Global variables
# Directory that holds this script, derived from the resolved path of $0.
CURRENT_PATH=$(realpath "$0")
CURRENT_PATH=$(dirname "$CURRENT_PATH")
# Parent of the first directory named "src" that find reports below CURRENT_PATH.
LOCAL_PATH=$(find "$CURRENT_PATH" -type d -name "src" -exec dirname {} \; | head -n 1)
# File names, looked up relative to CURRENT_PATH by the code that uses them.
CFG_FILE=remote_cfg_template.txt
EXCLUDE_FILE=.rsync_exclude_file
# Initial placeholder values; the main loop re-reads the connection settings
# from each config section and reassigns REMOTE_PATH, LOG_NAME and
# CHIPBENCH_DOCKER_NAME on every iteration.
ARCH=gpu
SUDO=sudo
IP=10.9.113.22
PORT=22
PASSWORD=123456
USER=root
CHIPBENCH_DOCKER_NAME=chipbenchmark.gpu
LOG_NAME=REPORT
DEVICE_ID=0
REMOTE_PATH=/root
DOMAIN=remote
DEBUG "DEBUG: $LOG_LEVEL"
# Strip leading and trailing whitespace from a string.
#   $1      - the string to trim
#   stdout  - the trimmed string followed by a newline
# An input that is empty or consists only of whitespace yields an empty line
# (the former sed expression left whitespace-only input untouched because its
# pattern required at least one non-space character).
function string_trim()
{
    local text=$1
    # Remove the longest leading run of whitespace ...
    text="${text#"${text%%[![:space:]]*}"}"
    # ... and the longest trailing run.
    text="${text%"${text##*[![:space:]]}"}"
    printf '%s\n' "$text"
}
# Locate the line range of one "[name]" section inside a config file.
#   $1      - config file
#   $2      - section name (used inside a grep pattern)
#   stdout  - "<start> <end>": the line number of the section header and the
#             line number of the next header; when the section is the last
#             one, <end> is the number of lines in the file instead
# Side effect: in the last-section case the global last_line is assigned.
function get_region() {
    local cfg_file=$1
    local user_id=$2
    # Line numbers of the matching header and of the header that follows it.
    local -a header_lines=( $(cat -n $cfg_file | grep "\\[.*\\]" | grep -A 1 "\\[$user_id\\]" | awk '{print $1}') )
    local lines="${header_lines[*]}"
    if (( ${#header_lines[@]} == 1 )); then
        last_line=$(wc -l < "$cfg_file")
        echo "$lines" "$last_line"
    else
        echo "$lines"
    fi
}
# Read a single "key = value" entry from one "[section]" of the config file.
#   $1      - config file
#   $2      - section name, without the brackets (compared literally)
#   $3      - key name (compared literally against the whole key)
#   stdout  - the value with surrounding blanks removed; nothing when the
#             section or key does not exist
# Lines starting with '#' and empty lines are ignored.
# Differences from the previous sed/grep pipeline, all of them fixes:
#   * the key must match exactly ("id" no longer also matches "device_id");
#   * everything after the first '=' is the value, so values that contain
#     '=' themselves (e.g. passwords) are no longer truncated;
#   * no temporary file is written into the current directory;
#   * a missing section no longer produces a sed error.
function get_config() {
    local cfg_file=$1
    local user_id=$2
    local cfg_name=$3
    awk -v section="$user_id" -v key="$cfg_name" '
        /^#/ || /^$/ { next }
        /^[ \t]*\[.*\]/ {
            # Section header: remember whether it is the requested one.
            name = $0
            sub(/^[ \t]*\[/, "", name)
            sub(/\].*$/, "", name)
            in_section = (name == section)
            next
        }
        in_section {
            pos = index($0, "=")
            if (pos == 0) next
            k = substr($0, 1, pos - 1)
            v = substr($0, pos + 1)
            gsub(/^[ \t]+|[ \t]+$/, "", k)
            gsub(/^[ \t]+|[ \t]+$/, "", v)
            if (k == key) { print v; exit }
        }
    ' "$cfg_file"
}
# List the names of all "[section]" headers in a config file.
#   $1      - config file
#   stdout  - the section names on one line, separated by single spaces
# Lines starting with '#' and empty lines are skipped.
# The file path is quoted so paths containing spaces work, and the bracket
# extraction uses plain grep -o / sed instead of the GNU-only "grep -P".
function get_cfg_id_list() {
    local cfg_file=$1
    local num_list
    num_list=$(grep -vE '^#|^$' "${cfg_file}" \
        | grep -o '\[[^]]\{1,\}\]' \
        | sed -e 's/^\[//' -e 's/\]$//' \
        | xargs)
    echo "$num_list"
}
# Load the connection settings of one config section into the globals
# DOMAIN, PORT, ARCH, IP, PASSWORD, USER and DEVICE_ID.
#   $1 - section name (the text between the brackets)
# The settings are read from "$CURRENT_PATH/$CFG_FILE", the same file the
# main flow checks for existence and enumerates sections from. The former
# hard-coded, cwd-relative "remote_cfg_template.txt" only worked when the
# script was started from its own directory.
function parse_config_file(){
    local cfg_path="${CURRENT_PATH:-.}/${CFG_FILE:-remote_cfg_template.txt}"
    DOMAIN=$(get_config "$cfg_path" "$1" domain)
    PORT=$(get_config "$cfg_path" "$1" port)
    ARCH=$(get_config "$cfg_path" "$1" arch)
    IP=$(get_config "$cfg_path" "$1" ip)
    PASSWORD=$(get_config "$cfg_path" "$1" password)
    USER=$(get_config "$cfg_path" "$1" user)
    DEVICE_ID=$(get_config "$cfg_path" "$1" device_id)
}
# Validate a dotted-quad IPv4 address.
#   $1      - candidate address
#   stdout  - $ERROR_SUCCESS when the address has exactly four numeric
#             fields of 1-3 digits, each in the range 0..255; otherwise
#             $ERROR_CFG_PARAM_INVALID_IPADDR
# Unlike the previous version this uses only local variables (the globals
# a, b, c, d, num and ipaddr are no longer overwritten) and malformed input
# no longer triggers "integer expression expected" noise on stderr.
function check_ipaddr_is_correct()
{
    local ipaddr=$1
    local ret=$ERROR_SUCCESS
    local octet
    if [[ $ipaddr =~ ^([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})$ ]]; then
        for octet in "${BASH_REMATCH[@]:1}"; do
            # 10# forces base 10 so fields such as "008" are not read as octal.
            if (( 10#$octet > 255 )); then
                ret=$ERROR_CFG_PARAM_INVALID_IPADDR
            fi
        done
    else
        ret=$ERROR_CFG_PARAM_INVALID_IPADDR
    fi
    echo "$ret"
}
# Run a command string locally through eval, logging it at debug level first.
#   $1      - the command line to run; an empty string only produces a warning
#   returns - the exit status of the evaluated command
function do_cmd() {
    if [ -z "$1" ]; then
        WARN "cmd str is null."
        return
    fi
    DEBUG "$1"
    eval "$1"
}
# Same as running a command string through eval, but with the command's
# stdout and stderr discarded. The command is still logged at debug level.
#   $1      - the command line to run; an empty string only produces a warning
#   returns - the exit status of the evaluated command
function do_cmd_silent() {
    if [ -z "$1" ]; then
        WARN "cmd str is null."
        return
    fi
    DEBUG "$1"
    eval "$1" > /dev/null 2>&1 # silent means no output
}
# Run a command on the remote host over ssh (password supplied by sshpass via
# the SSHPASS environment variable) and let its output through.
#   $1      - remote command line; an empty string only produces a warning
#   globals - reads PASSWORD, PORT, RETRY_TIMES, USER, IP, SUDO; writes CMD
#   returns - exit status of the ssh pipeline
# For a non-root USER the password is also piped to "${SUDO} -S" on the
# remote side.
# "$USER" is now quoted in the comparison: with an empty USER the old
# unquoted test printed "[: ==: unary operator expected".
# NOTE(review): the command is assembled as a string and eval'ed, so a
# password containing a single quote breaks it, and the debug log shows the
# password in clear text.
function do_remote_cmd() {
    if [ -z "$1" ]; then
        WARN "cmd str is null."
        return
    fi
    local ssh_part="SSHPASS='${PASSWORD}' sshpass -e ssh -p ${PORT} -o ConnectTimeout=${RETRY_TIMES} -o StrictHostKeyChecking=no ${USER}@${IP}"
    if [ "$USER" == "root" ]; then
        CMD="${ssh_part} ${SUDO} \"$1\""
    else
        CMD="echo '${PASSWORD}' | ${ssh_part} ${SUDO} -S --prompt= \"$1\""
    fi
    DEBUG "$CMD"
    eval "$CMD"
}
# Run a command on the remote host over ssh with all of its output discarded
# and report success or failure as an error code on stdout.
#   $1      - remote command line
#   globals - reads PASSWORD, PORT, RETRY_TIMES, USER, IP, SUDO; writes CMD
#   stdout  - last line is $ERROR_SUCCESS when the remote command exited 0,
#             otherwise $ERROR_REMOTE_SSH_COMMAND_FAILED (also for an empty
#             command). Log lines from DEBUG/WARN may precede the code, which
#             is why callers take the last line with "tail -n 1".
# "$USER" is now quoted in the comparison: with an empty USER the old
# unquoted test printed "[: ==: unary operator expected".
function do_remote_cmd_silent() {
    local ret=$ERROR_SUCCESS
    if [ -n "$1" ]; then
        local ssh_part="SSHPASS='${PASSWORD}' sshpass -e ssh -p ${PORT} -o ConnectTimeout=${RETRY_TIMES} -o StrictHostKeyChecking=no ${USER}@${IP}"
        if [ "$USER" == "root" ]; then
            CMD="${ssh_part} ${SUDO} \"$1\""
        else
            CMD="echo '${PASSWORD}' | ${ssh_part} ${SUDO} -S --prompt= \"$1\""
        fi
        DEBUG "$CMD"
        # silent means no output from the remote command
        if ! eval "$CMD" > /dev/null 2>&1; then
            ret=$ERROR_REMOTE_SSH_COMMAND_FAILED
        fi
    else
        WARN "cmd str is null."
        ret=$ERROR_REMOTE_SSH_COMMAND_FAILED
    fi
    echo "$ret"
}
# Run a command on the remote host over ssh and print what it wrote to stdout.
#   $1      - remote command line; an empty string only produces a warning
#   globals - reads PASSWORD, PORT, RETRY_TIMES, USER, IP, SUDO;
#             writes CMD and OUTPUT
#   stdout  - the captured stdout of the remote command
# "$USER" is now quoted in the comparison: with an empty USER the old
# unquoted test printed "[: ==: unary operator expected".
function do_remote_cmd_with_return() {
    if [ -z "$1" ]; then
        WARN "cmd str is null."
        return
    fi
    local ssh_part="SSHPASS='${PASSWORD}' sshpass -e ssh -p ${PORT} -o ConnectTimeout=${RETRY_TIMES} -o StrictHostKeyChecking=no ${USER}@${IP}"
    if [ "$USER" == "root" ]; then
        CMD="${ssh_part} ${SUDO} \"$1\""
    else
        CMD="echo '${PASSWORD}' | ${ssh_part} ${SUDO} -S --prompt= \"$1\""
    fi
    OUTPUT=$(eval "$CMD")
    echo "$OUTPUT"
}
# Copy a directory tree between the local machine and the remote server with
# rsync over ssh (password supplied by sshpass through SSHPASS).
#   $1 - source, $2 - destination; either may be "host:path"
#   e.g. do_sync_cmd dir1 dir2 copies dir1 to dir2
# Files listed in "$CURRENT_PATH/$EXCLUDE_FILE" are excluded. The remote
# rsync is started with --no-p --no-g --chmod=ugo=rwX.
# An empty $1 only produces a warning.
# Original author's setup note, kept verbatim (meaning not verified):
#   #3060 adduser suiyuan root. mkdir /home/chipbench/workspace
function do_sync_cmd() {
    if [ -z "$1" ]; then
        WARN "cmd str is null."
        return
    fi
    local remote_rsync_opt="--rsync-path=\"rsync --no-p --no-g --chmod=ugo=rwX\""
    local exclude_opt="--exclude-from=\"$CURRENT_PATH/${EXCLUDE_FILE}\""
    local rsh_opt="--rsh=\"sshpass -e ssh -p ${PORT} -o ConnectTimeout=${RETRY_TIMES} -l ${USER}\""
    CMD="SSHPASS='${PASSWORD}' rsync ${remote_rsync_opt} ${exclude_opt} -a ${rsh_opt} ${1} ${2}"
    do_cmd "$CMD"
}
# Quiet variant of the rsync-over-ssh copy: identical command line, but all
# output of the transfer (including the debug log of the command) is dropped.
#   $1 - source, $2 - destination; either may be "host:path"
# An empty $1 only produces a warning.
function do_sync_cmd_silent() {
    if [ -z "$1" ]; then
        WARN "cmd str is null."
        return
    fi
    local remote_rsync_opt="--rsync-path=\"rsync --no-p --no-g --chmod=ugo=rwX\""
    local exclude_opt="--exclude-from=\"$CURRENT_PATH/${EXCLUDE_FILE}\""
    local rsh_opt="--rsh=\"sshpass -e ssh -p ${PORT} -o ConnectTimeout=${RETRY_TIMES} -l ${USER}\""
    CMD="SSHPASS='${PASSWORD}' rsync ${remote_rsync_opt} ${exclude_opt} -a ${rsh_opt} ${1} ${2}"
    do_cmd "$CMD" > /dev/null 2>&1 # silent means no output
}
# Make sure a required external tool is installed; terminate the script with
# status 1 (after printing an install hint) when it is not.
#   $1 - name of the command to look for
# When the tool is found, its resolved location is logged at debug level.
# The previous version always ran "sshpass -V" here, even when the tool
# being checked was a different one (e.g. rsync).
function command_is_exist() {
    local tool_path
    if ! tool_path=$(command -v "${1}" 2> /dev/null); then
        WARN "${1} could not be found. Please install ${1}."
        WARN "For Ubuntu, you can install it using: sudo apt install ${1}"
        exit 1
    fi
    DEBUG "${1} found: ${tool_path}"
}
# Decide whether a device index exists on a machine with a given device count.
#   $1      - device id from the config file
#   $2      - number of devices reported by the remote host
#   stdout  - ${VALID} when $1 is a non-negative integer smaller than $2,
#             otherwise ${INVALID}
# Empty or non-numeric arguments and negative ids are reported as invalid
# (the old "[ $1 -lt $2 ]" accepted negative ids and printed a test error
# for empty/non-numeric input).
function check_device_id_is_valid() {
    if [[ $1 =~ ^[0-9]+$ && $2 =~ ^[0-9]+$ ]] && (( 10#$1 < 10#$2 )); then
        echo "${VALID}"
    else
        echo "${INVALID}"
    fi
}
# Print the toolchain suffix that belongs to the current ARCH:
# ${CUDA} when ARCH is "gpu", ${TOPS} for every other value.
function get_suffix_from_type() {
    case "$ARCH" in
        gpu)
            echo ${CUDA}
            ;;
        *)
            echo ${TOPS}
            ;;
    esac
}
# Delete a regular file when it exists; do nothing otherwise.
#   $1 - path of the file
# The path is quoted and preceded by "--": with the old unquoted
# "rm -rf $1" a path containing blanks was split into several arguments, so
# the intended file survived and unrelated paths could be removed instead.
function delete_file_if_exit(){
    if [ -f "$1" ]; then
        rm -rf -- "$1"
        DEBUG "rm -rf $1"
    fi
}
# Abort the script with status 1 (after a warning) unless $1 is an existing
# regular file.
#   $1 - path to check
function check_file_is_exist_or_exit() {
    [ -f "$1" ] && return 0
    WARN "File $1 not found!"
    exit 1
}
# Abort the script with status 1 (after a warning) unless $1 is an existing
# directory; a debug line is logged when the directory is present.
#   $1 - path to check
function check_dir_is_exist_or_exit() {
    if [ -d "$1" ]; then
        DEBUG "Directory $1 found!"
        return
    fi
    WARN "Directory $1 not found!"
    exit 1
}
# Abort the script with status 1 (after a warning) when directory $1 has no
# entries; hidden entries count as content (ls -A).
#   $1 - directory to inspect
# The path is quoted: with the old unquoted "ls -A $1" a directory whose
# path contains blanks was always reported as empty.
function check_dir_has_files_or_exit() {
    if [ -z "$(ls -A -- "$1")" ]; then
        WARN "Directory[$1] is empty. Exiting..."
        exit 1
    else
        DEBUG "Directory[$1] is NOT empty. continue..."
    fi
}
# Report whether a docker container with the given name is running on the
# remote host.
#   $1      - container name; the script exits with 1 when it is empty
#   stdout  - ${ACTIVE} when at least one running container matches,
#             otherwise ${INACTIVE} (also when the remote query fails)
# Fixes compared with the previous version:
#   * the query is passed to do_remote_cmd_with_return as ONE quoted
#     argument (unquoted, only its first word was sent to the remote host);
#   * the result test was inverted: an empty reply meant "active" and any
#     count, including 0, meant "inactive";
#   * debug output goes to stderr so that stdout carries only the state;
#   * the awk field reference is written as \\\$NF so that it survives the
#     local eval inside do_remote_cmd_with_return and reaches the remote awk.
function docker_is_active() {
    # Check if 1 is set
    if [ -z "$1" ]; then
        WARN "CHIPBENCH_DOCKER_NAME is not set."
        exit 1
    fi
    # Count the running containers whose name matches $1.
    local query="${SUDO} docker ps --filter \"name=$1\" --filter \"status=running\" | awk '{print \\\$NF}' | grep \"$1\" | wc -l"
    DEBUG "CMD: $query" >&2
    local running_count
    running_count=$(do_remote_cmd_with_return "$query")
    # wc may pad its output with blanks; keep only the digits.
    running_count=${running_count//[[:space:]]/}
    DEBUG "NUM_ACTIVE: $running_count" >&2
    if [[ $running_count =~ ^[0-9]+$ ]] && (( 10#$running_count > 0 )); then
        echo "${ACTIVE}"
    else
        echo "${INACTIVE}"
    fi
}
# Print the boxed failure summary for one config section via WARN.
#   $1 - error message
#   $2 - additional information, shown in brackets after the message
#   $3 - name of the config section (case) that failed
dump_fail_result_msg(){
    local err_text=$1
    local extra_info=$2
    local case_name=$3
    local banner_line
    for banner_line in \
        "***********************************************************************************" \
        "* CFG CASE : ${case_name}" \
        "* FAILED" \
        "* ${err_text}[${extra_info}]" \
        "**********************************************************************************"; do
        WARN "$banner_line"
    done
}
# Print the boxed success summary for one config section via TRACE.
#   $1 - name of the config section (case) that succeeded
# The global LOG_NAME is shown as the place the log was saved to.
dump_success_result_msg(){
    local case_name=$1
    local banner_line
    for banner_line in \
        "*********************************************************************************" \
        "* CFG CASE : ${case_name}" \
        "* SUCCESS" \
        "* log saved to $LOG_NAME" \
        "*********************************************************************************"; do
        TRACE "$banner_line"
    done
}
# Log the settings that will be used for one config section (INFO level).
#   $1 - name of the config section (case)
# Reads the globals ARCH, DOMAIN, IP, PORT, PASSWORD, USER, DEVICE_ID,
# LOG_NAME, LOCAL_PATH, CURRENT_PATH and REMOTE_PATH.
# The password itself is no longer written to the log; the line only shows
# whether one is configured.
dump_config_msg(){
    local password_hint="<empty>"
    if [ -n "$PASSWORD" ]; then
        password_hint="******"
    fi
    INFO "================================="
    INFO "case name : $1"
    INFO "arch : $ARCH"
    INFO "domain : $DOMAIN"
    INFO "remote IP : $IP"
    INFO "remote PORT : $PORT"
    INFO "remote PW : $password_hint"
    INFO "remote USER : $USER"
    INFO "device id : $DEVICE_ID"
    INFO "log name : $LOG_NAME"
    INFO "local path : $LOCAL_PATH"
    INFO "current path: $CURRENT_PATH"
    INFO "remote_path : $REMOTE_PATH"
}
# Verify that every mandatory setting has a value.
#   globals - DOMAIN, PORT, ARCH, IP, USER, DEVICE_ID, REMOTE_PATH
#             (PASSWORD is not part of the check)
#   stdout  - $ERROR_SUCCESS when all of them are non-empty, otherwise
#             $ERROR_CFG_PARAM_INVALID_EMPTY
function check_cfg_param_is_empty() {
    local ret=$ERROR_SUCCESS
    local value
    for value in "$DOMAIN" "$PORT" "$ARCH" "$IP" "$USER" "$DEVICE_ID" "$REMOTE_PATH"; do
        if [ -z "$value" ]; then
            ret=$ERROR_CFG_PARAM_INVALID_EMPTY
            break
        fi
    done
    echo $ret
}
# =============================================================================
# Main flow
#   1. make sure the local build output exists,
#   2. read the list of [sections] from the config file,
#   3. for every section: validate the settings, connect to the server, sync
#      the sources, prepare the docker container, run the report script in
#      it and copy the resulting log back,
#   4. print a summary.
# A failing section is counted and skipped; the loop continues with the next.
#
# Fixes relative to the previous version:
#   * "ROCESS=" typo: the GPU process list was stored in the wrong variable,
#     so PROCESS was always empty and a busy GPU was never detected;
#   * SUDO is now set on every iteration; before, one root section cleared it
#     for all following non-root sections;
#   * the nested double quotes in the "test -d/-f ... && rm" commands are
#     escaped so that the whole command reaches do_cmd_silent as one string;
#   * operands of [ ... ] and path arguments are quoted.
# =============================================================================
# NOTE(review): SSH_ERR is not referenced anywhere else in the visible script.
SSH_ERR="ssh connect to host $IP port $PORT: Connection refused"
# 1.Check if the build directory has files, if not, exit
BUILD_DIR="${LOCAL_PATH}/src/build"
check_dir_is_exist_or_exit "$BUILD_DIR"
check_dir_has_files_or_exit "$BUILD_DIR"
# 2. Parse IP, PASSWORD, USER from cfg file
check_file_is_exist_or_exit "$CURRENT_PATH/$CFG_FILE"
# Create tmp log dir (an existing one is removed first)
LOD_TMP_DIR="build_case_log"
LOG_DIR="${LOCAL_PATH}/src/${LOD_TMP_DIR}"
do_cmd_silent "test -d \"${LOG_DIR}\" && rm -rf \"${LOG_DIR}\""
do_cmd_silent "mkdir -p \"${LOG_DIR}\""
DEBUG "LOG_DIR: $LOG_DIR"
UINT_LIST=$(get_cfg_id_list "$CURRENT_PATH/$CFG_FILE")
UINT_NUM=$(echo "$UINT_LIST" | wc -w)
# Initialize counters
SUCCESS_COUNT=0
FAIL_COUNT=0
DEBUG "UINT_NUM: $UINT_NUM:[${UINT_LIST}]"
# Start time
start_time=$(date +%s)
for i in $UINT_LIST; do
    # Start every section from a clean slate so values cannot leak over.
    unset DOMAIN ARCH PORT IP PASSWORD USER DEVICE_ID LOG_NAME
    parse_config_file "$i"
    # Make sure the REMOTE_PATH path is in the user directory, especially for non-root users, or rsync will fail
    REMOTE_PATH="/tmp/${DOMAIN}/chipbench"
    LOG_NAME=${i}.log
    do_cmd_silent "test -f \"${LOG_NAME}\" && rm -f \"${LOG_NAME}\""
    # root needs no sudo; every other user does. Set both ways each time.
    if [ "${USER}" == "root" ]; then
        SUDO=""
    else
        SUDO="sudo"
    fi
    # must after remote_path
    dump_config_msg "$i"
    # Check if the necessary parameters are empty
    ret_code=$(check_cfg_param_is_empty)
    DEBUG "check_cfg_param_is_empty ret_code: $ret_code"
    if [ "$ret_code" != "$ERROR_SUCCESS" ]; then
        err_str=$(error_msg "$ret_code")
        dump_fail_result_msg "$err_str" "some params are null" "${i}"
        FAIL_COUNT=$((FAIL_COUNT + 1))
        continue
    fi
    # Check if the IP address is valid
    ret_code=$(check_ipaddr_is_correct "$IP")
    DEBUG "check_ipaddr_is_correct ret_code: $ret_code"
    if [ "$ret_code" != "$ERROR_SUCCESS" ]; then
        err_str=$(error_msg "$ret_code")
        dump_fail_result_msg "$err_str" "$IP" "${i}"
        FAIL_COUNT=$((FAIL_COUNT + 1))
        continue
    fi
    # 3. Check whether sshpass and rsync are installed
    command_is_exist sshpass
    command_is_exist rsync
    # 4. Check if REMOTE_PATH exists on the remote server, create it if it does not
    # do_remote_cmd "adduser ${USER} root"
    # Check if the user is not in the root group and add them to the root group if they are not
    # if ! id -nG "$USER" | grep -qw "root"; then
    # DEBUG "User $USER is not in the root group. Adding to root group..."
    # do_remote_cmd_silent "adduser ${USER} root"
    # else
    # DEBUG "User $USER is already in the root group."
    # fi
    # Check ssh connection is ok
    # (tail -n 1: the helper may print log lines before the status code)
    ret_code=$(do_remote_cmd_silent "pwd" | tail -n 1)
    DEBUG "do_remote_cmd_silent ret_code: $ret_code"
    if [ "$ret_code" != "$ERROR_SUCCESS" ]; then
        err_str=$(error_msg "$ret_code")
        dump_fail_result_msg "$err_str" "failed to connect to ${USER}:${IP}" "${i}"
        FAIL_COUNT=$((FAIL_COUNT + 1))
        continue
    fi
    # Check if the user is in the root group and add them to the root group
    if [ "$USER" != "root" ]; then
        ret_code=$(do_remote_cmd_silent "adduser ${USER} root" | tail -n 1)
        DEBUG "do_remote_cmd_silent ret_code: $ret_code"
        if [ "$ret_code" != "$ERROR_SUCCESS" ]; then
            err_str=$(error_msg "$ret_code")
            dump_fail_result_msg "$err_str" "adduser ${USER} root" "${i}"
            FAIL_COUNT=$((FAIL_COUNT + 1))
            continue
        fi
    fi
    # delete remote path /tmp/${DOMAIN}/chipbench if it exists
    # (result deliberately not checked: a missing directory is fine)
    ret_code=$(do_remote_cmd_silent "test -d ${REMOTE_PATH}/ && rm -rf ${REMOTE_PATH}/")
    # create remote path /tmp/${DOMAIN}/chipbench
    ret_code=$(do_remote_cmd_silent "mkdir -p ${REMOTE_PATH}/" | tail -n 1)
    DEBUG "do_remote_cmd_silent ret_code: $ret_code"
    if [ "$ret_code" != "$ERROR_SUCCESS" ]; then
        err_str=$(error_msg "$ret_code")
        dump_fail_result_msg "$err_str" "mkdir -p ${REMOTE_PATH}" "${i}"
        FAIL_COUNT=$((FAIL_COUNT + 1))
        continue
    fi
    # chown
    ret_code=$(do_remote_cmd_silent "chown -R $USER:$USER ${REMOTE_PATH}/" | tail -n 1)
    DEBUG "do_remote_cmd_silent ret_code: $ret_code"
    if [ "$ret_code" != "$ERROR_SUCCESS" ]; then
        err_str=$(error_msg "$ret_code")
        dump_fail_result_msg "$err_str" "chown -R $USER:$USER ${REMOTE_PATH}" "${i}"
        FAIL_COUNT=$((FAIL_COUNT + 1))
        continue
    fi
    # 5. Sync the current directory to the remote server
    # NOTE(review): the rsync result is not checked here, although
    # ERROR_REMOTE_SSH_RSYN_FAILED exists for that purpose.
    DEBUG "sync current directory to remote server"
    if [ "$LOG_LEVEL" -ge 2 ]; then
        do_sync_cmd "${LOCAL_PATH}/" "${IP}:${REMOTE_PATH}/"
    else
        ret_code=$(do_sync_cmd_silent "${LOCAL_PATH}/" "${IP}:${REMOTE_PATH}/")
    fi
    # 6. Build the docker container
    CHIPBENCH_DOCKER_NAME=$(do_remote_cmd_with_return "${REMOTE_PATH}/docker/build_or_run.sh name $ARCH $DOMAIN")
    DEBUG "Build docker container[$CHIPBENCH_DOCKER_NAME]..."
    if [ "$FORCE_DELETE_DOCKER" ]; then
        DEBUG "Force delete the docker container[$CHIPBENCH_DOCKER_NAME]..."
        ret_code=$(do_remote_cmd_silent "${REMOTE_PATH}/docker/build_or_run.sh rm $ARCH $DOMAIN")
    fi
    if [ "$LOG_LEVEL" -ge 2 ]; then
        do_remote_cmd "${REMOTE_PATH}/docker/build_or_run.sh build $ARCH $DOMAIN"
    else
        ret_code=$(do_remote_cmd_silent "${REMOTE_PATH}/docker/build_or_run.sh build $ARCH $DOMAIN")
    fi
    # 7.if container is already running, skip the restart
    mount_cmd="${SUDO} mount -t nfs -o ro -o vers=3 10.9.231.206:/ef_Infra/devtools /home/.devtools"
    ret_code=$(do_remote_cmd_silent "${mount_cmd}")
    DOCKER_ACTIVE=$(do_remote_cmd_with_return "${REMOTE_PATH}/docker/build_or_run.sh status $ARCH $DOMAIN")
    DEBUG "docker container [$CHIPBENCH_DOCKER_NAME] status: $DOCKER_ACTIVE"
    if [ "$DOCKER_ACTIVE" == "${ACTIVE}" ]; then
        DEBUG "Docker container[$CHIPBENCH_DOCKER_NAME] is already running."
    else
        DEBUG "Start the docker container[$CHIPBENCH_DOCKER_NAME]..."
        ret_code=$(do_remote_cmd_silent "${REMOTE_PATH}/docker/build_or_run.sh restart $ARCH $DOMAIN")
    fi
    # get the gpu name
    if [ "$ARCH" == "gpu" ]; then
        DEVICE_NAME=$(do_remote_cmd_with_return "nvidia-smi --query-gpu name --format=noheader,csv -i ${DEVICE_ID}")
    else
        # NOTE(review): the tool is called "ppp" here but "pppp" further down - confirm which is right.
        inquery_cmd="ppp| grep -E '^[| ].[$DEVICE_ID-$DEVICE_ID].[ ]'| grep -v 'C'"
        tmp_name=$(do_remote_cmd_with_return "${inquery_cmd}")
        DEVICE_NAME=$(echo "$tmp_name" | awk '{print $3}')
    fi
    # Check if the docker container env is correct
    if [ "$ARCH" == "gcu" ]; then
        ret_code=$(do_remote_cmd_silent "${SUDO} docker exec $CHIPBENCH_DOCKER_NAME /bin/bash -c \\\"topscc --version\\\"" | tail -n 1)
        if [ "$ret_code" != "$ERROR_SUCCESS" ]; then
            err_str=$(error_msg "$ret_code")
            DEBUG "err_str: $err_str, try create env"
            WARN "${DEVICE_NAME}, ENV is not correct, try to create env, WAITTING..."
            # Scratch directory for the downloaded packages.
            random_str=$(date +%s%N)
            deb_path=/tmp/${random_str}
            do_remote_cmd "test ! -d ${deb_path} && ${SUDO} mkdir -p ${deb_path}"
            efgrab_cmd="source /home/.devtools/tools/env.sh && cd ${deb_path} && efgrab efml && efgrab topsplatform && chmod +x ./*.run && dpkg -i ./*.deb && ./*.run --no-auto-load -y && rm -rf ${deb_path}"
            if [ "${LOG_LEVEL}" -ge 2 ]; then
                do_remote_cmd "${SUDO} docker exec $CHIPBENCH_DOCKER_NAME /bin/bash -c \\\"${efgrab_cmd}\\\""
            else
                ret_code=$(do_remote_cmd_silent "${SUDO} docker exec $CHIPBENCH_DOCKER_NAME /bin/bash -c \\\"${efgrab_cmd}\\\"")
            fi
            efml_so_path=$(do_remote_cmd_with_return "${SUDO} docker exec $CHIPBENCH_DOCKER_NAME /bin/bash -c \\\"dpkg -L efml |grep -F libefml.so\\\"" | tail -n 1)
            efml_so_dir_path=$(dirname "$efml_so_path")
            DEBUG "efml_so_dir_path: $efml_so_dir_path"
            ret_code=$(do_remote_cmd_silent "${SUDO} docker exec $CHIPBENCH_DOCKER_NAME /bin/bash -c \\\"cp -f ${efml_so_dir_path}/libefml.so* /usr/lib/\\\"")
        fi
    fi
    # 8. Run the test case pppp -L | awk '/[0-9]/ {print $1}'|grep -v "-"|wc -l
    # 8.1 Check if the device id is valid
    DEBUG "Check if the device id is valid"
    if [ "$ARCH" == "gpu" ]; then
        inquery_cmd=" nvidia-smi -L"
        DEVICE_INFO=$(do_remote_cmd_with_return "${inquery_cmd}")
        DEVICE_NUM=$(echo "$DEVICE_INFO" | grep -v '^\s*$' | wc -l)
    else
        inquery_cmd="pppp -L"
        DEVICE_INFO=$(do_remote_cmd_with_return "${inquery_cmd}")
        DEVICE_NUM=$(echo "$DEVICE_INFO" | awk '/[0-9]/ {print $1}'| grep -v "-"| wc -l)
    fi
    DEBUG "$DEVICE_INFO"
    DEBUG "GPU ID is :${DEVICE_ID} , GPU count is $DEVICE_NUM"
    # Left unquoted on purpose: word splitting drops any padding around the wc output.
    valid_result=$(check_device_id_is_valid $DEVICE_ID $DEVICE_NUM)
    DEBUG "valid_result: $valid_result"
    if [ "$valid_result" == "${INVALID}" ]; then
        err_str=$(error_msg "$ERROR_CFG_PARAM_INVALID_DEVICE_ID")
        dump_fail_result_msg "$err_str" "Invalid device id $DEVICE_ID, device id should be [0,$(($DEVICE_NUM-1))]" "${i}"
        FAIL_COUNT=$((FAIL_COUNT + 1))
        continue
    fi
    DEBUG "Device id $DEVICE_ID is valid."
    # 8.2 Peek the remote directory, for debugging
    if [ "$LOG_LEVEL" -ge 2 ]; then
        do_remote_cmd "ls -la ${REMOTE_PATH}"
    fi
    # 8.3 Get the GPU Information
    if [ "$ARCH" == "gpu" ]; then
        inquery_cmd="nvidia-smi"
    else
        inquery_cmd="pppp"
    fi
    DEVICE_INFO=$(do_remote_cmd_with_return "${inquery_cmd}")
    DEBUG "$DEVICE_INFO"
    # 8.4 Check the GPU processes. If there are processes running on the GPU, exit
    DEBUG "Current device id is $DEVICE_ID, name is:$DEVICE_NAME"
    # 8.4.1 Check if the GPU Name is consistent with the cfg file
    # (both names are lower-cased; '-' resp. '_' become blanks before comparing)
    lowercase_device_name=$(echo "$DEVICE_NAME" | tr '[:upper:]' '[:lower:]' | tr '-' ' ')
    cfg_device_name=$(echo "$i" | tr '[:upper:]' '[:lower:]' | tr '_' ' ')
    DEBUG "lowercase_device_name: $lowercase_device_name"
    DEBUG "cfg_device_name: $cfg_device_name"
    if ! do_cmd_silent "echo '$lowercase_device_name' | grep -q '$cfg_device_name'"; then
        err_str=$(error_msg "$ERROR_CFG_PARAM_INVALID_DEVICE_NAME")
        dump_fail_result_msg "$err_str" "Device name is $DEVICE_NAME" "${i}"
        FAIL_COUNT=$((FAIL_COUNT + 1))
        continue
    fi
    # 8.4.2 Check if the GPU is busy
    DEBUG "get the GPU process"
    if [ "$ARCH" == "gpu" ]; then
        # One pid per line; any non-blank line means the GPU is in use.
        PROCESS=$(do_remote_cmd_with_return "nvidia-smi --query-compute-apps pid --format=noheader,csv -i ${DEVICE_ID}")
        PROCESS_NUM=$(echo "$PROCESS" | grep -v '^\s*$' | wc -l)
        if [ "$PROCESS_NUM" -gt 0 ]; then
            err_str=$(error_msg "$ERROR_CURRENT_DEVICE_IS_BUSY")
            dump_fail_result_msg "$err_str" "There are $PROCESS_NUM processes running on GPU $DEVICE_NAME:${DEVICE_ID}" "${i}"
            FAIL_COUNT=$((FAIL_COUNT + 1))
            continue
        fi
    else
        gcu_info=$(do_remote_cmd_with_return "pppp --pmon -c 1 -o -i ${DEVICE_ID}")
        gcu_no_process=$(echo "$gcu_info" | grep 'no process running on' | wc -l)
        DEBUG "gcu_no_process: $gcu_no_process"
        if [ "$gcu_no_process" -eq 0 ]; then
            err_str=$(error_msg "$ERROR_CURRENT_DEVICE_IS_BUSY")
            dump_fail_result_msg "$err_str" "There are processes running on GPU $DEVICE_NAME:${DEVICE_ID}" "${i}"
            FAIL_COUNT=$((FAIL_COUNT + 1))
            continue
        fi
    fi
    # 8.5 Run the test case
    DEBUG "No process is running on GPU $DEVICE_ID"
    if [ "${LOG_LEVEL}" -ge 2 ]; then
        do_remote_cmd "docker exec $CHIPBENCH_DOCKER_NAME /bin/bash -c \\\"CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=${DEVICE_ID} ${REMOTE_PATH}/src/report.sh ${LOG_NAME} ${ARCH}\\\""
    else
        ret_code=$(do_remote_cmd_silent "docker exec $CHIPBENCH_DOCKER_NAME /bin/bash -c \\\"CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=${DEVICE_ID} ${REMOTE_PATH}/src/report.sh ${LOG_NAME} ${ARCH}\\\"")
    fi
    # Sync the remote directory back to the local directory
    DEBUG "sync report file to local"
    do_sync_cmd_silent "${IP}:${REMOTE_PATH}/" "${LOCAL_PATH}/"
    do_cmd_silent "mv -f \"${i}.log\" \"${LOG_DIR}/\""
    # 9. Save the log file
    INFO "Log saved in $LOG_NAME"
    # Stop the docker container
    # WARN "Stop the docker container, [$CHIPBENCH_DOCKER_NAME]..."
    # do_remote_cmd_silent "${REMOTE_PATH}/docker/build_or_run.sh stop $ARCH $DOMAIN"
    SUCCESS_COUNT=$((SUCCESS_COUNT + 1))
    dump_success_result_msg "${i}"
done
# End time
end_time=$(date +%s)
# Calculate and print the elapsed time
elapsed_time=$((end_time - start_time))
echo "Elapsed time: $elapsed_time seconds"
echo "All count: ${UINT_NUM}, Success count: $SUCCESS_COUNT, Fail count: $FAIL_COUNT"
echo "All done."
exit 0
原文地址:https://blog.csdn.net/weixin_43360707/article/details/143821275
免责声明:本站文章内容转载自网络资源,如本站内容侵犯了原著者的合法权益,可联系本站删除。更多内容请关注自学内容网(zxcms.com)!