# configuration notes:
#
# - `source .env/bin/activate` currently has to be the first command of every step that needs the
#   virtual environment. Otherwise the step uses the system-wide Python interpreter.

name: "Self-hosted runner (scheduled)"

# Triggers: pushes to any branch matching `multi_ci_*`, a `repository_dispatch`
# event, and the cron schedule below.
on:
  push:
    branches: ["multi_ci_*"]
  repository_dispatch:
  schedule:
    # Minute 0, hour 0, every day.
    - cron: '0 0 * * *'

jobs:
  # PyTorch job on a single-GPU self-hosted runner: runs the `tests` suite, the
  # `examples` suite and the pipeline-marked tests, then uploads the reports.
  run_all_tests_torch_gpu:
    runs-on: [self-hosted, gpu, single-gpu]
    steps:
      - uses: actions/checkout@v2

      # Restores the `.env` virtualenv. The key embeds the hash of setup.py, so
      # a change to setup.py produces a new key (and therefore a cache miss).
      - name: Loading cache.
        uses: actions/cache@v2
        id: cache
        with:
          path: .env
          key: v1.2-slow_tests_torch_gpu-${{ hashFiles('setup.py') }}

      # Diagnostics only; the virtualenv is not activated in this step.
      - name: Python version
        run: |
          which python
          python --version
          pip --version

      - name: Current dir
        run: pwd

      - run: nvidia-smi

      # pkill exits non-zero when nothing matched; the `|| echo` keeps the step green.
      - name: Kill any run-away pytest processes
        run: (pkill -f tests; pkill -f examples) || echo "no zombies"

      # Only runs when the cache step did not report an exact key hit.
      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
        if: steps.cache.outputs.cache-hit != 'true'
        run: |
          python -m venv .env
          source .env/bin/activate
          which python
          python --version
          pip --version

      - name: Install dependencies
        run: |
          source .env/bin/activate
          pip install --upgrade pip
          pip install .[torch,sklearn,testing,onnxruntime,sentencepiece]
          pip install git+https://github.com/huggingface/datasets
          pip list

      - name: Are GPUs recognized by our DL frameworks
        run: |
          source .env/bin/activate
          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

      # Env values are quoted so that every YAML loader reads them as the
      # literal strings "1" / "yes" instead of an integer / boolean.
      - name: Run all tests on GPU
        env:
          OMP_NUM_THREADS: "1"
          RUN_SLOW: "yes"
        run: |
          source .env/bin/activate
          python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch_gpu tests

      # `always()` makes the remaining steps run even if an earlier step failed.
      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_torch_gpu_failures_short.txt

      - name: Run examples tests on GPU
        if: ${{ always() }}
        env:
          OMP_NUM_THREADS: "1"
          RUN_SLOW: "yes"
        run: |
          source .env/bin/activate
          pip install -r examples/_tests_requirements.txt
          python -m pytest -n 1 --dist=loadfile -s --make-reports=examples_torch_gpu examples

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/examples_torch_gpu_failures_short.txt

      # `-m is_pipeline_test` restricts the run to tests carrying that marker.
      - name: Run all pipeline tests on GPU
        if: ${{ always() }}
        env:
          TF_FORCE_GPU_ALLOW_GROWTH: "true"
          OMP_NUM_THREADS: "1"
          RUN_SLOW: "yes"
          RUN_PIPELINE_TESTS: "yes"
        run: |
          source .env/bin/activate
          python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_torch_pipeline_gpu_failures_short.txt

      # Uploads the whole `reports` directory as a single artifact.
      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: run_all_tests_torch_gpu_test_reports
          path: reports


  # TensorFlow job on a single-GPU self-hosted runner: runs the `tests` suite
  # and the pipeline-marked tests, then uploads the reports.
  run_all_tests_tf_gpu:
    runs-on: [self-hosted, gpu, single-gpu]
    steps:
      - uses: actions/checkout@v2

      # Restores the `.env` virtualenv. The key embeds the hash of setup.py, so
      # a change to setup.py produces a new key (and therefore a cache miss).
      - name: Loading cache.
        uses: actions/cache@v2
        id: cache
        with:
          path: .env
          key: v1.2-slow_tests_tf_gpu-${{ hashFiles('setup.py') }}

      # Diagnostics only; the virtualenv is not activated in this step.
      - name: Python version
        run: |
          which python
          python --version
          pip --version

      - name: Current dir
        run: pwd

      - run: nvidia-smi

      # pkill exits non-zero when nothing matched; the `|| echo` keeps the step green.
      - name: Kill any run-away pytest processes
        run: (pkill -f tests; pkill -f examples) || echo "no zombies"

      # Only runs when the cache step did not report an exact key hit.
      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
        if: steps.cache.outputs.cache-hit != 'true'
        run: |
          python -m venv .env
          source .env/bin/activate
          which python
          python --version
          pip --version

      - name: Install dependencies
        run: |
          source .env/bin/activate
          pip install --upgrade pip
          pip install .[tf,sklearn,testing,onnxruntime,sentencepiece]
          pip install git+https://github.com/huggingface/datasets
          pip list

      - name: Are GPUs recognized by our DL frameworks
        run: |
          source .env/bin/activate
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"

      # Env values are quoted so that every YAML loader reads them as the
      # literal strings "1" / "yes" instead of an integer / boolean.
      - name: Run all tests on GPU
        env:
          OMP_NUM_THREADS: "1"
          RUN_SLOW: "yes"
        run: |
          source .env/bin/activate
          python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_tf_gpu tests

      # `always()` makes the remaining steps run even if an earlier step failed.
      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_tf_gpu_failures_short.txt

      # `-m is_pipeline_test` restricts the run to tests carrying that marker.
      - name: Run all pipeline tests on GPU
        if: ${{ always() }}
        env:
          TF_FORCE_GPU_ALLOW_GROWTH: "true"
          OMP_NUM_THREADS: "1"
          RUN_SLOW: "yes"
          RUN_PIPELINE_TESTS: "yes"
        run: |
          source .env/bin/activate
          python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_tf_pipelines_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_tf_pipelines_gpu_failures_short.txt

      # Uploads the whole `reports` directory as a single artifact.
      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: run_all_tests_tf_gpu_test_reports
          path: reports

  # PyTorch job on a multi-GPU self-hosted runner: runs the `tests` suite, the
  # `examples` suite and the pipeline-marked tests, then uploads the reports.
  run_all_tests_torch_multi_gpu:
    runs-on: [self-hosted, gpu, multi-gpu]
    steps:
      - uses: actions/checkout@v2

      # Restores the `.env` virtualenv. The key embeds the hash of setup.py, so
      # a change to setup.py produces a new key (and therefore a cache miss).
      - name: Loading cache.
        uses: actions/cache@v2
        id: cache
        with:
          path: .env
          key: v1.2-slow_tests_torch_multi_gpu-${{ hashFiles('setup.py') }}

      # Diagnostics only; the virtualenv is not activated in this step.
      - name: Python version
        run: |
          which python
          python --version
          pip --version

      - name: Current dir
        run: pwd

      - run: nvidia-smi

      # pkill exits non-zero when nothing matched; the `|| echo` keeps the step green.
      - name: Kill any run-away pytest processes
        run: (pkill -f tests; pkill -f examples) || echo "no zombies"

      # Only runs when the cache step did not report an exact key hit.
      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
        if: steps.cache.outputs.cache-hit != 'true'
        run: |
          python -m venv .env
          source .env/bin/activate
          which python
          python --version
          pip --version

      # Installs fairscale and deepspeed in addition to the package extras.
      - name: Install dependencies
        run: |
          source .env/bin/activate
          pip install --upgrade pip
          pip install .[torch,sklearn,testing,onnxruntime,sentencepiece]
          pip install git+https://github.com/huggingface/datasets
          pip install fairscale
          pip install deepspeed
          pip list

      - name: Are GPUs recognized by our DL frameworks
        run: |
          source .env/bin/activate
          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

      # Env values are quoted so that every YAML loader reads them as the
      # literal strings "1" / "yes" instead of an integer / boolean.
      - name: Run all tests on multi-GPU
        env:
          OMP_NUM_THREADS: "1"
          RUN_SLOW: "yes"
        run: |
          source .env/bin/activate
          python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch_multi_gpu tests

      # `always()` makes the remaining steps run even if an earlier step failed.
      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_torch_multi_gpu_failures_short.txt

      - name: Run examples tests on multi-GPU
        if: ${{ always() }}
        env:
          OMP_NUM_THREADS: "1"
          RUN_SLOW: "yes"
        run: |
          source .env/bin/activate
          pip install -r examples/_tests_requirements.txt
          python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch_examples_multi_gpu examples

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_torch_examples_multi_gpu_failures_short.txt

      # `-m is_pipeline_test` restricts the run to tests carrying that marker.
      - name: Run all pipeline tests on multi-GPU
        if: ${{ always() }}
        env:
          TF_FORCE_GPU_ALLOW_GROWTH: "true"
          OMP_NUM_THREADS: "1"
          RUN_SLOW: "yes"
          RUN_PIPELINE_TESTS: "yes"
        run: |
          source .env/bin/activate
          python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_torch_pipeline_multi_gpu_failures_short.txt

      # Uploads the whole `reports` directory as a single artifact.
      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: run_all_tests_torch_multi_gpu_test_reports
          path: reports

  # TensorFlow job on a multi-GPU self-hosted runner: runs the `tests` suite
  # and the pipeline-marked tests, then uploads the reports.
  run_all_tests_tf_multi_gpu:
    runs-on: [self-hosted, gpu, multi-gpu]
    steps:
      - uses: actions/checkout@v2

      # Restores the `.env` virtualenv. The key embeds the hash of setup.py, so
      # a change to setup.py produces a new key (and therefore a cache miss).
      - name: Loading cache.
        uses: actions/cache@v2
        id: cache
        with:
          path: .env
          key: v1.2-slow_tests_tf_multi_gpu-${{ hashFiles('setup.py') }}

      # Diagnostics only; the virtualenv is not activated in this step.
      - name: Python version
        run: |
          which python
          python --version
          pip --version

      - name: Current dir
        run: pwd

      - run: nvidia-smi

      # pkill exits non-zero when nothing matched; the `|| echo` keeps the step green.
      - name: Kill any run-away pytest processes
        run: (pkill -f tests; pkill -f examples) || echo "no zombies"

      # Only runs when the cache step did not report an exact key hit.
      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
        if: steps.cache.outputs.cache-hit != 'true'
        run: |
          python -m venv .env
          source .env/bin/activate
          which python
          python --version
          pip --version

      - name: Install dependencies
        run: |
          source .env/bin/activate
          pip install --upgrade pip
          pip install .[tf,sklearn,testing,onnxruntime,sentencepiece]
          pip install git+https://github.com/huggingface/datasets
          pip list

      - name: Are GPUs recognized by our DL frameworks
        run: |
          source .env/bin/activate
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"

      # Env values are quoted so that every YAML loader reads them as the
      # literal strings "1" / "yes" instead of an integer / boolean.
      - name: Run all tests on multi-GPU
        env:
          OMP_NUM_THREADS: "1"
          RUN_SLOW: "yes"
        run: |
          source .env/bin/activate
          python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_tf_multi_gpu tests

      # `always()` makes the remaining steps run even if an earlier step failed.
      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_tf_multi_gpu_failures_short.txt

      # `-m is_pipeline_test` restricts the run to tests carrying that marker.
      - name: Run all pipeline tests on multi-GPU
        if: ${{ always() }}
        env:
          TF_FORCE_GPU_ALLOW_GROWTH: "true"
          OMP_NUM_THREADS: "1"
          RUN_SLOW: "yes"
          RUN_PIPELINE_TESTS: "yes"
        run: |
          source .env/bin/activate
          python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_tf_pipeline_multi_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
        run: cat reports/tests_tf_pipeline_multi_gpu_failures_short.txt

      # Uploads the whole `reports` directory as a single artifact.
      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: run_all_tests_tf_multi_gpu_test_reports
          path: reports