From fa4d0fd1ef6946e80c8f18d02a6a38ca1bffb361 Mon Sep 17 00:00:00 2001
From: Michael Melesse
Date: Mon, 17 Oct 2022 17:28:48 +0000
Subject: [PATCH] add scripts

---
 scripts/amd/backtrace.sh            |  14 +++
 scripts/amd/build.sh                |   7 ++
 scripts/amd/check_llvm_src.sh       |   3 +
 scripts/amd/clean.sh                |  27 ++++++
 scripts/amd/collect_rocm_kernels.sh |  11 +++
 scripts/amd/debug.sh                |  13 +++
 scripts/amd/delete_hip_files.sh     |   2 +
 scripts/amd/deps.sh                 |   3 +
 scripts/amd/docker_build.sh         |  16 ++++
 scripts/amd/docker_run.sh           |  29 ++++++
 scripts/amd/find_lib.sh             |   2 +
 scripts/amd/git_config_user.sh      |   7 ++
 scripts/amd/git_submodule_add.sh    |   1 +
 scripts/amd/git_submodule_rm.sh     |  10 +++
 scripts/amd/git_submodule_update.sh |   6 ++
 scripts/amd/grep_for_symbol.sh      |  18 ++++
 scripts/amd/hipify.sh               |   1 +
 scripts/amd/lld.sh                  |   1 +
 scripts/amd/post.sh                 |   2 +
 scripts/amd/run.sh                  |  18 ++++
 scripts/amd/test.sh                 | 133 ++++++++++++++++++++++++++++
 scripts/amd/test_fptrunc.py         |  62 +++++++++++++
 22 files changed, 386 insertions(+)
 create mode 100644 scripts/amd/backtrace.sh
 create mode 100755 scripts/amd/build.sh
 create mode 100644 scripts/amd/check_llvm_src.sh
 create mode 100644 scripts/amd/clean.sh
 create mode 100644 scripts/amd/collect_rocm_kernels.sh
 create mode 100644 scripts/amd/debug.sh
 create mode 100644 scripts/amd/delete_hip_files.sh
 create mode 100644 scripts/amd/deps.sh
 create mode 100644 scripts/amd/docker_build.sh
 create mode 100755 scripts/amd/docker_run.sh
 create mode 100644 scripts/amd/find_lib.sh
 create mode 100644 scripts/amd/git_config_user.sh
 create mode 100644 scripts/amd/git_submodule_add.sh
 create mode 100644 scripts/amd/git_submodule_rm.sh
 create mode 100644 scripts/amd/git_submodule_update.sh
 create mode 100644 scripts/amd/grep_for_symbol.sh
 create mode 100644 scripts/amd/hipify.sh
 create mode 100644 scripts/amd/lld.sh
 create mode 100644 scripts/amd/post.sh
 create mode 100644 scripts/amd/run.sh
 create mode 100755 scripts/amd/test.sh
 create mode 100644 scripts/amd/test_fptrunc.py

diff --git a/scripts/amd/backtrace.sh b/scripts/amd/backtrace.sh
new file mode 100644
index 000000000..d68db532d
--- /dev/null
+++ b/scripts/amd/backtrace.sh
@@ -0,0 +1,14 @@
+sudo apt install gdb -y
+
+# export AMD_OCL_WAIT_COMMAND=1
+# export AMD_LOG_LEVEL=3
+# export HIP_LAUNCH_BLOCKING=1
+
+
+gdb -ex "set pagination off" \
+    -ex "file python" \
+    -ex 'run -m pytest --capture=tee-sys --verbose "python/test/unit/language/test_core.py::test_program_id[float16]"' \
+    -ex "backtrace" \
+    -ex "set confirm off" \
+    -ex "q" \
+    2>&1 | tee /dockerx/pytorch/test_core_gdb.log
diff --git a/scripts/amd/build.sh b/scripts/amd/build.sh
new file mode 100755
index 000000000..962e46775
--- /dev/null
+++ b/scripts/amd/build.sh
@@ -0,0 +1,7 @@
+set -e
+cd python
+pip uninstall -y triton
+# export TRITON_USE_ROCM=ON
+
+export TRITON_ROCM_DEBUG=ON
+pip install --verbose -e .
\ No newline at end of file
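
Note: a minimal usage sketch for build.sh, assuming the repo is mounted at /dockerx/triton inside the ROCm container started by docker_run.sh below; the path and the build.log name are illustrative, not part of the patch:

    cd /dockerx/triton
    bash scripts/amd/build.sh 2>&1 | tee build.log
    # sanity-check that the editable install is importable
    python -c "import triton; print(triton.__file__)"
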
diff --git a/scripts/amd/check_llvm_src.sh b/scripts/amd/check_llvm_src.sh
new file mode 100644
index 000000000..a54e58f33
--- /dev/null
+++ b/scripts/amd/check_llvm_src.sh
@@ -0,0 +1,3 @@
+shopt -s extglob
+/opt/rocm/llvm/bin/llc -mcpu=gfx908 triton_rocm_kernels/*+([0-9]).ll
+# /opt/rocm/llvm/bin/llc -mcpu=gfx908 triton_rocm_kernels/*_before_verify.ll
\ No newline at end of file
diff --git a/scripts/amd/clean.sh b/scripts/amd/clean.sh
new file mode 100644
index 000000000..9bc7c4b2a
--- /dev/null
+++ b/scripts/amd/clean.sh
@@ -0,0 +1,27 @@
+set -x
+rm -rf core
+rm -rf ptx.hip
+rm -rf python/build/
+rm -rf python/test/__pycache__/
+rm -rf python/triton.egg-info/
+rm -rf python/triton/_C/libtriton.so
+rm -rf python/triton/__pycache__/
+rm -rf python/triton/ops/__pycache__/
+rm -rf python/triton/ops/blocksparse/__pycache__/
+rm -rf *.isa
+rm -rf *.gcn
+rm -rf *.ptx
+rm -rf *.ll
+rm -rf *.s
+rm -rf *.o
+rm -rf *.hsaco
+rm -rf *.ttir
+sh scripts/amd/delete_hip_files.sh
+rm -rf triton_rocm_kernels
+rm -rf /tmp/*.ll
+rm -rf /tmp/*.gcn
+rm -rf /tmp/*.hsaco
+rm -rf /tmp/*.o
+rm -rf /tmp/*.ttir
+rm -rf /tmp/*.s
+rm -rf build
\ No newline at end of file
diff --git a/scripts/amd/collect_rocm_kernels.sh b/scripts/amd/collect_rocm_kernels.sh
new file mode 100644
index 000000000..331558155
--- /dev/null
+++ b/scripts/amd/collect_rocm_kernels.sh
@@ -0,0 +1,11 @@
+# COPY kernels
+DIRNAME=triton_rocm_kernels
+rm -rf $DIRNAME
+mkdir $DIRNAME
+mv /tmp/*.ttir $DIRNAME
+mv /tmp/*.ll $DIRNAME
+mv /tmp/*.gcn $DIRNAME
+mv /tmp/*.o $DIRNAME
+mv /tmp/*.hsaco $DIRNAME
+mv /tmp/*.s $DIRNAME
+chmod -R 777 $DIRNAME
diff --git a/scripts/amd/debug.sh b/scripts/amd/debug.sh
new file mode 100644
index 000000000..45cd57ba0
--- /dev/null
+++ b/scripts/amd/debug.sh
@@ -0,0 +1,13 @@
+sudo apt install gdb -y
+
+# export AMD_OCL_WAIT_COMMAND=1
+# export AMD_LOG_LEVEL=3
+# export HIP_LAUNCH_BLOCKING=1
+
+gdb -ex "set pagination off" \
+    -ex "file python" \
+    -ex "set confirm off" \
+    -ex "break 1" \
+    -ex 'run -m pytest --capture=tee-sys --verbose "python/test/unit/language/test_core.py::test_load_and_store_op[float32-2]"' \
+    -ex "q" \
+    2>&1 | tee /dockerx/pytorch/test_core_gdb.log
diff --git a/scripts/amd/delete_hip_files.sh b/scripts/amd/delete_hip_files.sh
new file mode 100644
index 000000000..b6b5abe5c
--- /dev/null
+++ b/scripts/amd/delete_hip_files.sh
@@ -0,0 +1,2 @@
+# find . -name '*hip.h' -delete
+find . -name '*_hip.*' -delete
\ No newline at end of file
diff --git a/scripts/amd/deps.sh b/scripts/amd/deps.sh
new file mode 100644
index 000000000..d7f2970cc
--- /dev/null
+++ b/scripts/amd/deps.sh
@@ -0,0 +1,3 @@
+sudo apt update
+sudo apt install libtinfo-dev gdb
+# sudo apt install llvm-11 # install on cuda
\ No newline at end of file
diff --git a/scripts/amd/docker_build.sh b/scripts/amd/docker_build.sh
new file mode 100644
index 000000000..0395ac0e2
--- /dev/null
+++ b/scripts/amd/docker_build.sh
@@ -0,0 +1,16 @@
+# print every command
+set -o xtrace
+
+# set path
+# DOCKERFILE_PATH=scripts/docker/Dockerfile.triton_rocm
+# DOCKERFILE_PATH=scripts/docker/Dockerfile.triton_cuda
+# DOCKERFILE_PATH=triton_rocm_all_archs.Dockerfile
+DOCKERFILE_PATH=triton_rocm_20-52.Dockerfile
+
+# get tag
+DOCKERFILE_NAME=$(basename $DOCKERFILE_PATH)
+DOCKERIMAGE_NAME=$(echo "$DOCKERFILE_NAME" | cut -f -1 -d '.')
+echo $DOCKERIMAGE_NAME
+
+# build docker
+docker build --build-arg CACHEBUST=$(date +%s) -f $DOCKERFILE_PATH -t $DOCKERIMAGE_NAME .
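
Note: docker_build.sh derives the image tag from the Dockerfile name by keeping everything before the first '.'; a worked example of the cut invocation used above:

    echo "triton_rocm_20-52.Dockerfile" | cut -f -1 -d '.'
    # prints: triton_rocm_20-52
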
diff --git a/scripts/amd/docker_run.sh b/scripts/amd/docker_run.sh
new file mode 100755
index 000000000..63a34fa9c
--- /dev/null
+++ b/scripts/amd/docker_run.sh
@@ -0,0 +1,29 @@
+set -o xtrace
+
+alias drun='sudo docker run -it --rm --network=host --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined'
+
+# DEVICES="--gpus all"
+DEVICES="--device=/dev/kfd --device=/dev/dri"
+
+MEMORY="--ipc=host --shm-size 16G"
+
+VOLUMES="-v $HOME/dockerx:/dockerx -v /data:/data"
+
+# WORK_DIR='/root/$(basename $(pwd))'
+WORK_DIR="/dockerx/$(basename $(pwd))"
+
+# IMAGE_NAME=nvcr.io/nvidia/pytorch:21.08-py3
+# IMAGE_NAME=rocm/pytorch # latest does not work
+# IMAGE_NAME=rocm/pytorch:rocm4.3.1_ubuntu18.04_py3.6_pytorch_1.10.0
+IMAGE_NAME=triton_rocm_20-52 # build this Docker image before running
+
+CONTAINER_NAME=triton
+
+# start a new container
+CONTAINER_ID=$(drun -d -w $WORK_DIR --name $CONTAINER_NAME $MEMORY $VOLUMES $DEVICES $IMAGE_NAME)
+echo "CONTAINER_ID: $CONTAINER_ID"
+# docker cp . $CONTAINER_ID:$WORK_DIR
+# docker exec $CONTAINER_ID bash -c "bash scripts/amd/run.sh"
+docker attach $CONTAINER_ID
+docker stop $CONTAINER_ID
+docker rm $CONTAINER_ID
diff --git a/scripts/amd/find_lib.sh b/scripts/amd/find_lib.sh
new file mode 100644
index 000000000..5c1c3240e
--- /dev/null
+++ b/scripts/amd/find_lib.sh
@@ -0,0 +1,2 @@
+LIB_NAME=libtinfow
+ldconfig -p | grep $LIB_NAME
\ No newline at end of file
diff --git a/scripts/amd/git_config_user.sh b/scripts/amd/git_config_user.sh
new file mode 100644
index 000000000..d1063202e
--- /dev/null
+++ b/scripts/amd/git_config_user.sh
@@ -0,0 +1,7 @@
+# use the --global flag if you want to set this for the whole machine
+git config user.name "Michael Melesse"
+git config user.email "micmelesse@gmail.com"
+
+# unset with
+# git config --global --unset-all user.name
+# git config --global --unset-all user.email
\ No newline at end of file
diff --git a/scripts/amd/git_submodule_add.sh b/scripts/amd/git_submodule_add.sh
new file mode 100644
index 000000000..07c971c47
--- /dev/null
+++ b/scripts/amd/git_submodule_add.sh
@@ -0,0 +1 @@
+git submodule add https://github.com/ROCmSoftwarePlatform/hipify-torch third_party/hipify-torch
\ No newline at end of file
diff --git a/scripts/amd/git_submodule_rm.sh b/scripts/amd/git_submodule_rm.sh
new file mode 100644
index 000000000..e42d6bd4a
--- /dev/null
+++ b/scripts/amd/git_submodule_rm.sh
@@ -0,0 +1,10 @@
+SUBMODULE=third_party/hipify-torch
+
+# Remove the submodule entry from .git/config
+git submodule deinit -f $SUBMODULE
+
+# Remove the submodule directory from the superproject's .git/modules directory
+rm -rf .git/modules/$SUBMODULE
+
+# Remove the entry in .gitmodules and remove the submodule directory located at path/to/submodule
+git rm -f $SUBMODULE
\ No newline at end of file
diff --git a/scripts/amd/git_submodule_update.sh b/scripts/amd/git_submodule_update.sh
new file mode 100644
index 000000000..431c18606
--- /dev/null
+++ b/scripts/amd/git_submodule_update.sh
@@ -0,0 +1,6 @@
+# if you are updating an existing checkout
+git submodule sync
+git submodule update --init --recursive
+
+# if you want to update every submodule to the tip of its remote branch
+# git submodule update --init --recursive --remote
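
Note: grep_for_symbol.sh below scans LLVM static archives for one mangled C++ symbol at a time. To see which declaration a mangled name corresponds to, c++filt can demangle it; for example, with one of the symbols listed in the script:

    echo "_ZN4llvm11DDGAnalysis3KeyE" | c++filt
    # llvm::DDGAnalysis::Key
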
diff --git a/scripts/amd/grep_for_symbol.sh b/scripts/amd/grep_for_symbol.sh
new file mode 100644
index 000000000..aae3d88b0
--- /dev/null
+++ b/scripts/amd/grep_for_symbol.sh
@@ -0,0 +1,18 @@
+# SYMBOL=_ZN4llvm11PassBuilder17OptimizationLevel2O0E
+# SYMBOL=_ZN4llvm11DDGAnalysis3KeyE
+# SYMBOL=_ZN4llvm26UnifyFunctionExitNodesPass3runERNS_8FunctionERNS_15AnalysisManagerIS1_JEEE
+# SYMBOL=_ZN4llvm12LoopFusePass3runERNS_8FunctionERNS_15AnalysisManagerIS1_JEEE
+# SYMBOL=_ZN4llvm30moveInstructionsToTheBeginningERNS_10BasicBlockES1_RNS_13DominatorTreeERKNS_17PostDominatorTreeERNS_14DependenceInfoE
+# SYMBOL=_ZN4llvm17LoopExtractorPass3runERNS_6ModuleERNS_15AnalysisManagerIS1_JEEE
+# SYMBOL=_ZN4llvm17ObjCARCExpandPass3runERNS_8FunctionERNS_15AnalysisManagerIS1_JEEE
+# SYMBOL=_ZN4llvm13CoroSplitPass3runERNS_13LazyCallGraph3SCCERNS_15AnalysisManagerIS2_JRS1_EEES5_RNS_17CGSCCUpdateResultE
+SYMBOL=_ZN4llvm20SyntheticCountsUtilsIPKNS_9CallGraphEE9propagateERKS3_NS_12function_refIFNS_8OptionalINS_12ScaledNumberImEEEEPKNS_13CallGraphNodeERKSt4pairINS8_INS_14WeakTrackingVHEEEPSC_EEEENS7_IFvSE_SA_EEE
+for lib in $(find /tmp/clang+llvm-13.0.0-x86_64-linux-gnu-ubuntu-16.04/ -name \*.a); do
+    symbols=$(nm $lib | grep $SYMBOL | grep -v " U ")
+
+    if [ "${#symbols}" -gt "0" ]; then
+        echo $lib
+        echo $symbols
+    fi
+
+done
diff --git a/scripts/amd/hipify.sh b/scripts/amd/hipify.sh
new file mode 100644
index 000000000..474659456
--- /dev/null
+++ b/scripts/amd/hipify.sh
@@ -0,0 +1 @@
+PYTHONDONTWRITEBYTECODE=1 python3 third_party/hipify-torch/hipify_cli.py --project-directory .
diff --git a/scripts/amd/lld.sh b/scripts/amd/lld.sh
new file mode 100644
index 000000000..f3542efa8
--- /dev/null
+++ b/scripts/amd/lld.sh
@@ -0,0 +1 @@
+/opt/rocm/llvm/bin/ld.lld -flavor gnu -shared _empty.o -o _empty.hsaco
diff --git a/scripts/amd/post.sh b/scripts/amd/post.sh
new file mode 100644
index 000000000..8c40f18d0
--- /dev/null
+++ b/scripts/amd/post.sh
@@ -0,0 +1,2 @@
+bash scripts/amd/collect_rocm_kernels.sh
+bash scripts/amd/check_llvm_src.sh
\ No newline at end of file
diff --git a/scripts/amd/run.sh b/scripts/amd/run.sh
new file mode 100644
index 000000000..d396b733a
--- /dev/null
+++ b/scripts/amd/run.sh
@@ -0,0 +1,18 @@
+clear
+
+ROOT_DIR=$(pwd)
+LOG_DIR=$ROOT_DIR/log_$(git rev-parse --symbolic-full-name --abbrev-ref HEAD)
+rm -rf $LOG_DIR
+mkdir -p $LOG_DIR
+chmod -R 777 $LOG_DIR
+
+
+bash scripts/amd/clean.sh
+# bash scripts/amd/deps.sh
+bash scripts/amd/build.sh
+bash scripts/amd/test.sh 2>&1 | tee $LOG_DIR/test.log
+# bash scripts/amd/debug.sh
+# bash scripts/amd/backtrace.sh 2>&1 | tee $LOG_DIR/backtrace.log
+
+
+bash scripts/amd/post.sh # don't call post.sh a second time elsewhere
\ No newline at end of file
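
Note: run.sh is the entry point that chains the other scripts (clean, build, test, post) and collects logs in a per-branch directory. A typical invocation from the repo root inside the container, assuming the setup above:

    bash scripts/amd/run.sh
    # the test output lands in log_<branch>/test.log, e.g.
    ls log_$(git rev-parse --symbolic-full-name --abbrev-ref HEAD)/
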
diff --git a/scripts/amd/test.sh b/scripts/amd/test.sh
new file mode 100755
index 000000000..65901c627
--- /dev/null
+++ b/scripts/amd/test.sh
@@ -0,0 +1,133 @@
+# clear
+rm -rf triton_rocm_kernels
+
+# export TRITON_LIBHIP=/opt/rocm/lib/libamdhip64.so
+
+# export AMD_OCL_WAIT_COMMAND=1
+# export AMD_LOG_LEVEL=3
+# export HIP_LAUNCH_BLOCKING=1
+
+# remove the cache to avoid segfaults
+# TODO: inform the Triton devs that the cache causes segfaults
+rm -rf /tmp/triton
+
+# pytest python/test
+# pytest python/test/test_blocksparse.py
+
+# pytest --verbose python/test/test_conv.py
+# pytest --verbose python/test/test_blocksparse.py::test_matmul[sdd-False-False-16-float16]
+# pytest --verbose python/test/test_blocksparse.py::test_attention_fwd_bwd
+# python python/test/test_conv.py
+
+# gdb -ex "set breakpoint pending on" \
+#     -ex 'break add_passes_to_emit_bin' \
+#     --args python python/test/test_add.py
+
+# python python/test/test_empty.py
+# -ex 'ignore 1 472' \
+
+pytest --verbose python/test/unit/language/test_core.py 2>&1 | tee /dockerx/triton/test_core.log
+
+# pytest --capture=tee-sys --verbose python/test/regression/test_performance.py | tee /dockerx/triton/test_performance.log
+# pytest --capture=tee-sys --verbose python/test/regression/test_performance.py::test_matmul | tee /dockerx/triton/test_performance_matmul.log
+# pytest --capture=tee-sys --verbose python/test/regression/test_performance.py::test_elementwise | tee /dockerx/triton/test_performance_elementwise.log
+
+# pytest --capture=tee-sys --verbose python/test/regression/test_performance.py::test_matmul[256-256-256]
+
+# pytest --capture=tee-sys --verbose python/test/unit/language/test_core.py::test_empty_kernel[float32]
+# pytest --verbose python/test/unit/language/test_core.py::test_load_and_store_op[float32-2]
+# pytest --capture=tee-sys --verbose python/test/unit/language/test_core.py::test_load_and_store_op_with_mask
+# pytest --verbose python/test/unit/language/test_core.py::test_program_id[float32]
+# pytest --capture=tee-sys --verbose python/test/unit/language/test_core.py::test_num_programs[float32]
+# pytest --verbose python/test/unit/language/test_core.py::test_unary_op
+# pytest --verbose python/test/unit/language/test_core.py::test_bin_op
+# pytest --verbose "python/test/unit/language/test_core.py::test_dot"
+# pytest --verbose python/test/unit/language/test_core.py::test_cast
+# pytest --verbose python/test/unit/language/test_core.py::test_reduce1d
+# pytest --verbose python/test/unit/language/test_core.py::test_reduce2d
+# pytest --verbose python/test/unit/language/test_core.py::test_math_op
+# pytest --capture=tee-sys --verbose python/test/unit/language/test_core.py::test_atomic_rmw
+# pytest --verbose python/test/unit/operators/test_blocksparse.py::test_matmul
+# pytest --verbose python/test/unit/operators/test_blocksparse.py::test_matmul[DTYPE0-16-False-False-dds]
+# pytest --verbose python/test/unit/operators/test_blocksparse.py::test_matmul[DTYPE0-64-False-False-dds]
+# pytest --capture=tee-sys --verbose python/test/unit/language/test_core.py::test_matmul
+
+# pytest --capture=tee-sys --verbose python/test/unit/language/test_core.py::test_load_and_store_op_with_mask
+# pytest --capture=tee-sys --verbose "python/test/unit/language/test_core.py::test_masked_load_shared_memory"
+# pytest --verbose "python/test/unit/operators/test_blocksparse.py::test_softmax[DTYPE0-256-16]"
+# pytest --verbose "python/test/unit/operators/test_blocksparse.py::test_softmax" #|& tee /dockerx/triton/test_softmax.log
+
+
+# pytest --verbose "python/test/unit/operators/test_blocksparse.py::test_softmax[DTYPE0-1024-16]" # PASSED [ 29%]
+# pytest --verbose "python/test/unit/operators/test_blocksparse.py::test_softmax[DTYPE0-1024-32]" # FAILED
+# pytest --verbose python/test/unit/language/test_core.py::test_permute
+# pytest --verbose python/test/unit/language/test_core.py::test_load_cache_modifier
+
+# pytest --verbose python/test/unit/language/test_core.py::test_math_op[log]
+# pytest --capture=tee-sys --verbose python/test/unit/language/test_core.py::test_load_and_store_op[float64]
+# pytest --verbose "python/test/unit/language/test_core.py::test_bin_op[int8-int64- x % y]"
+# pytest --verbose "python/test/unit/language/test_core.py::test_dot[none]" |& tee /dockerx/triton/test_dot_none.log
+# pytest --verbose "python/test/unit/language/test_core.py::test_dot[add-rows]"
+# pytest --verbose "python/test/unit/language/test_core.py::test_dot[add-cols]"
+# pytest --verbose "python/test/unit/language/test_core.py::test_cast[float32-float16-False]"
+
+# pytest --verbose python/test/unit/operators/test_blocksparse.py::test_matmul[DTYPE0-32-False-False-sdd]
+# pytest --capture=tee-sys --verbose python/test/unit/operators/test_blocksparse.py::test_softmax[DTYPE0-256-32]
+
+# pytest --verbose python/test/unit/operators/test_blocksparse.py
+# pytest --verbose python/test/unit/operators/test_blocksparse.py::test_matmul[DTYPE0-32-False-False-sdd]
+# pytest --verbose scripts/amd/test_fptrunc.py
+# pytest --verbose scripts/amd/test_fptrunc.py::test_fptrunc[float32-float32-False]
+# pytest --verbose "python/test/unit/language/test_core.py::test_cast"
+# pytest --verbose "python/test/unit/language/test_core.py::test_cast[float32-float16-False]"
+# pytest --verbose "python/test/unit/language/test_core.py::test_cast[float32-bfloat16-False]"
+# python python/test/unit/language/test_core.py
+
+# pytest --capture=tee-sys --verbose python/test/unit/language/test_core.py::test_empty_kernel
+
+# pytest --capture=tee-sys --verbose "python/test/unit/language/test_core.py::test_bin_op[int8-int64- x % y]"
+# pytest --capture=tee-sys --verbose "python/test/unit/language/test_core.py::test_bin_op[int8-float32- x % y]"
+# pytest --capture=tee-sys --verbose "python/test/unit/language/test_core.py::test_bin_op[int8-float16- x % y]"
+# pytest --capture=tee-sys --verbose "python/test/unit/language/test_core.py::test_bin_op[float32-float64- x % y]"
+# pytest --capture=tee-sys --verbose "python/test/unit/language/test_core.py::test_math_op[exp]"
+# pytest --verbose "python/test/unit/operators/test_blocksparse.py"
+# pytest --capture=tee-sys --verbose "python/test/unit/operators/test_blocksparse.py::test_matmul[sdd-False-False-16-float16]"
+
+# pytest --capture=tee-sys --verbose "python/test/unit/language/test_core.py::test_arange"
+
+# pytest --verbose "python/test/unit/language/test_core.py::test_masked_load_shared_memory"
+# pytest --verbose "python/test/unit/language/test_core.py::test_dot_without_load"
+# pytest --verbose "python/test/unit/language/test_core.py::test_fmadot"
+
+# FAILING TESTS
+# pytest --verbose "python/test/unit/language/test_core.py::test_bin_op[int8-float16- x % y]" \
+# "python/test/unit/language/test_core.py::test_bin_op[int8-float32- x % y]" \
+# "python/test/unit/language/test_core.py::test_bin_op[int8-float64- x % y]" \
+# "python/test/unit/language/test_core.py::test_bin_op[int16-float16- x % y]" \
+# "python/test/unit/language/test_core.py::test_bin_op[int16-float32- x % y]" \
+# "python/test/unit/language/test_core.py::test_bin_op[int16-float64- x % y]" \
+# "python/test/unit/language/test_core.py::test_bin_op[int32-float16- x % y]" \
+# "python/test/unit/language/test_core.py::test_bin_op[int32-float32- x % y]" \
+# "python/test/unit/language/test_core.py::test_bin_op[int32-float64- x % y]" \
+# "python/test/unit/language/test_core.py::test_bin_op[int64-float16- x % y]" \
+# "python/test/unit/language/test_core.py::test_bin_op[int64-float32- x % y]" \
+# "python/test/unit/language/test_core.py::test_bin_op[int64-float64- x % y]" \
+# "python/test/unit/language/test_core.py::test_bin_op[float16-int8- x % y]" \
+# "python/test/unit/language/test_core.py::test_bin_op[float16-int16- x % y]" \
+# "python/test/unit/language/test_core.py::test_bin_op[float16-int32- x % y]" \
+# "python/test/unit/language/test_core.py::test_bin_op[float16-int64- x % y]" \
+# "python/test/unit/language/test_core.py::test_bin_op[float16-float64- x % y]" \
+# "python/test/unit/language/test_core.py::test_bin_op[float32-int8- x % y]" \
+# "python/test/unit/language/test_core.py::test_bin_op[float32-int16- x % y]" \
+# "python/test/unit/language/test_core.py::test_bin_op[float32-int32- x % y]" \
+# "python/test/unit/language/test_core.py::test_bin_op[float32-int64- x % y]" \
+# "python/test/unit/language/test_core.py::test_bin_op[float32-float64- x % y]" \
+# "python/test/unit/language/test_core.py::test_bin_op[float64-int8- x % y]" \
+# "python/test/unit/language/test_core.py::test_bin_op[float64-int16- x % y]" \
+# "python/test/unit/language/test_core.py::test_bin_op[float64-int32- x % y]" \
+# "python/test/unit/language/test_core.py::test_bin_op[float64-int64- x % y]" \
+# "python/test/unit/language/test_core.py::test_bin_op[float64-float16- x % y]" \
+# "python/test/unit/language/test_core.py::test_bin_op[float64-float32- x % y]"
+
+# do post-test steps
+# bash scripts/amd/post.sh # it should be run from run.sh instead
diff --git a/scripts/amd/test_fptrunc.py b/scripts/amd/test_fptrunc.py
new file mode 100644
index 000000000..932ab2942
--- /dev/null
+++ b/scripts/amd/test_fptrunc.py
@@ -0,0 +1,62 @@
+import torch
+import triton
+import triton.language as tl
+import pytest
+
+cvt = {
+    'bool': torch.bool,
+    'int8': torch.int8,
+    'int16': torch.int16,
+    'int32': torch.int32,
+    'int64': torch.int64,
+    'bfloat16': torch.bfloat16,
+    'float16': torch.float16,
+    'float32': torch.float32,
+    'float64': torch.float64,
+}
+
+int_dtypes = ['int8', 'int16', 'int32', 'int64']
+float_dtypes = ['float16', 'float32', 'float64']
+dtypes = int_dtypes + float_dtypes
+
+
+@pytest.mark.parametrize("dtype_x, dtype_z, bitcast", [
+    (dtype_x, dtype_z, False)
+    for dtype_x in dtypes
+    for dtype_z in dtypes
+])
+def test_fptrunc(dtype_x, dtype_z, bitcast, device='cuda'):
+    SIZE = 256
+    # define the kernel / launch-grid
+    @triton.jit
+    def kernel(Z, X, **meta):
+        off = tl.arange(0, meta['SIZE'])
+        x = tl.load(X + off)
+        tl.store(Z + off, x)
+    # inputs
+    x = triton.testing.random(SIZE, dtype=cvt[dtype_x], device=device)
+
+    # reference result
+    z_ref = x.type(dtype=cvt[dtype_z])
+
+    # triton result
+    z_tri = torch.zeros_like(x, dtype=cvt[dtype_z])
+
+    # triton.testing.assert_almost_equal(z_ref, z_tri)
+
+    print("before kernel")
+    # run load and store kernel
+    kernel[(1, )](z_tri, x, SIZE=SIZE, num_warps=1)
+    print("after kernel")
+
+    # print("x:", x)
+    # print("z_ref:", z_ref)
+    # print("z_tri:", z_tri)
+    # compare
+    print("before compare")
+    triton.testing.assert_almost_equal(z_ref, z_tri)
+    print("after compare")
+
+
+if __name__ == '__main__':
+    test_fptrunc('float32', 'float16', False)  # example arguments for a direct run