[RunAllTests] Fix #3752: Introduce Bazel test batching in CI (#3757)

* Initial commit. This is needed to open a PR on GitHub. This commit is being made so that the PR can start off in a broken Actions state. This also initially disables most non-Bazel workflows to make workflow iteration faster and less impacting on other team members. * Introduce infrastructure for batching. This introduces a new mechanism for passing lists of tests to sharded test targets in CI, and hooks it up. No actual sharding is occurring yet. This led to some simplifications in the CI workflow since the script can be more dynamic in computing the full list of targets (which also works around a previous bug with instrumentation tests being run). Java proto lite also needed to be upgraded for the scripts to be able to use it. More testing/documentation needed as this functionality continues to expand. * Add bucketing strategy. This simply partitions bucketed groups of targets into chunks of 10 for each run. Only 3 buckets are currently retained to test sharding in CI before introducing full support. * Fix caching & stabilize builds. Fixes some caching bucket and output bugs. Also, introduces while loop & keep_going to introduce resilience against app test build failures (or just test failures in general). * Increase sharding & add randomization. Also, enable other workflows. Note that CI shouldn't fully pass yet since some documentation and testing needs to be added yet, but this is meant to be a more realistic test of the CI environment before the PR is finished. * Improving partitionin & readability. Adds a human-readable prefix to make the shards look a bit nicer. Also, adds more fine-tuned partitioning to bucket & reduce shard counts to improve overall timing. Will need to be tested in CI. * Add new tests & fix static analysis errors. * Fix script. A newly computed variable wasn't updated to be used in an earlier change. * Fix broken tests & test configuration. Add docstrings for proto. * Fix mistake from earlier commit. * Try 10 max parallel actions instead. See #3757 (comment) for context. * Fix another error from an earlier commit.
oppia · Sep 8, 2021 · 0c98a6c · 0c98a6c
1 parent 8bc6e66
commit 0c98a6c
Show file tree

Hide file tree

Showing 24 changed files with 1,394 additions and 201 deletions.
diff --git a/.github/workflows/static_checks.yml b/.github/workflows/static_checks.yml
@@ -96,6 +96,8 @@ jobs:
   script_checks:
     name: Script Checks
     runs-on: ubuntu-18.04
+    env:
+      CACHE_DIRECTORY: ~/.bazel_cache
     steps:
       - uses: actions/checkout@v2
 
@@ -104,6 +106,47 @@ jobs:
         with:
           version: 4.0.0
 
+      - uses: actions/cache@v2
+        id: scripts_cache
+        with:
+          path: ${{ env.CACHE_DIRECTORY }}
+          key: ${{ runner.os }}-${{ env.CACHE_DIRECTORY }}-bazel-scripts-${{ github.sha }}
+          restore-keys: |
+            ${{ runner.os }}-${{ env.CACHE_DIRECTORY }}-bazel-scripts-
+            ${{ runner.os }}-${{ env.CACHE_DIRECTORY }}-bazel-
+
+      # This check is needed to ensure that Bazel's unbounded cache growth doesn't result in a
+      # situation where the cache never updates (e.g. due to exceeding GitHub's cache size limit)
+      # thereby only ever using the last successful cache version. This solution will result in a
+      # few slower CI actions around the time cache is detected to be too large, but it should
+      # incrementally improve thereafter.
+      - name: Ensure cache size
+        env:
+          BAZEL_CACHE_DIR: ${{ env.CACHE_DIRECTORY }}
+        run: |
+          # See https://stackoverflow.com/a/27485157 for reference.
+          EXPANDED_BAZEL_CACHE_PATH="${BAZEL_CACHE_DIR/#\~/$HOME}"
+          CACHE_SIZE_MB=$(du -smc $EXPANDED_BAZEL_CACHE_PATH | grep total | cut -f1)
+          echo "Total size of Bazel cache (rounded up to MBs): $CACHE_SIZE_MB"
+          # Use a 4.5GB threshold since actions/cache compresses the results, and Bazel caches seem
+          # to only increase by a few hundred megabytes across changes for unrelated branches. This
+          # is also a reasonable upper-bound (local tests as of 2021-03-31 suggest that a full build
+          # of the codebase (e.g. //...) from scratch only requires a ~2.1GB uncompressed/~900MB
+          # compressed cache).
+          if [[ "$CACHE_SIZE_MB" -gt 4500 ]]; then
+            echo "Cache exceeds cut-off; resetting it (will result in a slow build)"
+            rm -rf $EXPANDED_BAZEL_CACHE_PATH
+          fi
+
+      - name: Configure Bazel to use a local cache
+        env:
+          BAZEL_CACHE_DIR: ${{ env.CACHE_DIRECTORY }}
+        run: |
+          EXPANDED_BAZEL_CACHE_PATH="${BAZEL_CACHE_DIR/#\~/$HOME}"
+          echo "Using $EXPANDED_BAZEL_CACHE_PATH as Bazel's cache path"
+          echo "build --disk_cache=$EXPANDED_BAZEL_CACHE_PATH" >> $HOME/.bazelrc
+        shell: bash
+
       - name: Regex Patterns Validation Check
         if: always()
         run: |
@@ -137,6 +180,9 @@ jobs:
           gh issue list --limit 2000 --repo oppia/oppia-android --json number > $(pwd)/open_issues.json
           bazel run //scripts:todo_open_check -- $(pwd) scripts/assets/todo_open_exemptions.pb open_issues.json
 
+  # Note that caching is intentionally not enabled for this check since licenses should always be
+  # verified without any potential influence from earlier builds (i.e. always from a clean build to
+  # ensure the results exactly match the current state of the repository).
   third_party_dependencies_check:
     name: Maven Dependencies Checks
     runs-on: ubuntu-18.04

diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
@@ -17,40 +17,78 @@ jobs:
     name: Compute affected tests
     runs-on: ubuntu-18.04
     outputs:
-      matrix: ${{ steps.compute-test-matrix-from-affected.outputs.matrix || steps.compute-test-matrix-from-all.outputs.matrix }}
-      have_tests_to_run: ${{ steps.compute-test-matrix-from-affected.outputs.have_tests_to_run || steps.compute-test-matrix-from-all.outputs.have_tests_to_run }}
+      matrix: ${{ steps.compute-test-matrix.outputs.matrix }}
+      have_tests_to_run: ${{ steps.compute-test-matrix.outputs.have_tests_to_run }}
+    env:
+      CACHE_DIRECTORY: ~/.bazel_cache
     steps:
       - uses: actions/checkout@v2
         with:
           fetch-depth: 0
+
       - name: Set up Bazel
         uses: abhinavsingh/setup-bazel@v3
         with:
           version: 4.0.0
-      - name: Compute test matrix based on affected targets
-        id: compute-test-matrix-from-affected
-        if: "!contains(github.event.pull_request.title, '[RunAllTests]')"
+
+      - uses: actions/cache@v2
+        id: scripts_cache
+        with:
+          path: ${{ env.CACHE_DIRECTORY }}
+          key: ${{ runner.os }}-${{ env.CACHE_DIRECTORY }}-bazel-scripts-${{ github.sha }}
+          restore-keys: |
+            ${{ runner.os }}-${{ env.CACHE_DIRECTORY }}-bazel-scripts-
+            ${{ runner.os }}-${{ env.CACHE_DIRECTORY }}-bazel-
+
+      # This check is needed to ensure that Bazel's unbounded cache growth doesn't result in a
+      # situation where the cache never updates (e.g. due to exceeding GitHub's cache size limit)
+      # thereby only ever using the last successful cache version. This solution will result in a
+      # few slower CI actions around the time cache is detected to be too large, but it should
+      # incrementally improve thereafter.
+      - name: Ensure cache size
+        env:
+          BAZEL_CACHE_DIR: ${{ env.CACHE_DIRECTORY }}
+        run: |
+          # See https://stackoverflow.com/a/27485157 for reference.
+          EXPANDED_BAZEL_CACHE_PATH="${BAZEL_CACHE_DIR/#\~/$HOME}"
+          CACHE_SIZE_MB=$(du -smc $EXPANDED_BAZEL_CACHE_PATH | grep total | cut -f1)
+          echo "Total size of Bazel cache (rounded up to MBs): $CACHE_SIZE_MB"
+          # Use a 4.5GB threshold since actions/cache compresses the results, and Bazel caches seem
+          # to only increase by a few hundred megabytes across changes for unrelated branches. This
+          # is also a reasonable upper-bound (local tests as of 2021-03-31 suggest that a full build
+          # of the codebase (e.g. //...) from scratch only requires a ~2.1GB uncompressed/~900MB
+          # compressed cache).
+          if [[ "$CACHE_SIZE_MB" -gt 4500 ]]; then
+            echo "Cache exceeds cut-off; resetting it (will result in a slow build)"
+            rm -rf $EXPANDED_BAZEL_CACHE_PATH
+          fi
+
+      - name: Configure Bazel to use a local cache
+        env:
+          BAZEL_CACHE_DIR: ${{ env.CACHE_DIRECTORY }}
+        run: |
+          EXPANDED_BAZEL_CACHE_PATH="${BAZEL_CACHE_DIR/#\~/$HOME}"
+          echo "Using $EXPANDED_BAZEL_CACHE_PATH as Bazel's cache path"
+          echo "build --disk_cache=$EXPANDED_BAZEL_CACHE_PATH" >> $HOME/.bazelrc
+        shell: bash
+
+      - name: Compute test matrix
+        id: compute-test-matrix
+        env:
+          compute_all_targets: ${{ contains(github.event.pull_request.title, '[RunAllTests]') }}
         # https://unix.stackexchange.com/a/338124 for reference on creating a JSON-friendly
         # comma-separated list of test targets for the matrix.
         run: |
-          bazel run //scripts:compute_affected_tests -- $(pwd) $(pwd)/affected_targets.log origin/develop
-          TEST_TARGET_LIST=$(cat ./affected_targets.log | sed 's/^\|$/"/g' | paste -sd, -)
-          echo "Affected tests (note that this might be all tests if on the develop branch): $TEST_TARGET_LIST"
-          echo "::set-output name=matrix::{\"test-target\":[$TEST_TARGET_LIST]}"
-          if [[ ! -z "$TEST_TARGET_LIST" ]]; then
+          bazel run //scripts:compute_affected_tests -- $(pwd) $(pwd)/affected_targets.log origin/develop compute_all_tests=$compute_all_targets
+          TEST_BUCKET_LIST=$(cat ./affected_targets.log | sed 's/^\|$/"/g' | paste -sd, -)
+          echo "Affected tests (note that this might be all tests if configured to run all or on the develop branch): $TEST_BUCKET_LIST"
+          echo "::set-output name=matrix::{\"affected-tests-bucket-base64-encoded-shard\":[$TEST_BUCKET_LIST]}"
+          if [[ ! -z "$TEST_BUCKET_LIST" ]]; then
             echo "::set-output name=have_tests_to_run::true"
           else
             echo "::set-output name=have_tests_to_run::false"
             echo "No tests are detected as affected by this change. If this is wrong, you can add '[RunAllTests]' to the PR title to force a run."
           fi
-      - name: Compute test matrix based on all tests
-        id: compute-test-matrix-from-all
-        if: "contains(github.event.pull_request.title, '[RunAllTests]')"
-        run: |
-          TEST_TARGET_LIST=$(bazel query "kind(test, //...)" | sed 's/^\|$/"/g' | paste -sd, -)
-          echo "Affected tests (note that this might be all tests if on the develop branch): $TEST_TARGET_LIST"
-          echo "::set-output name=matrix::{\"test-target\":[$TEST_TARGET_LIST]}"
-          echo "::set-output name=have_tests_to_run::true"
 
   bazel_run_test:
     name: Run Bazel Test
@@ -59,8 +97,8 @@ jobs:
     runs-on: ubuntu-18.04
     strategy:
       fail-fast: false
-      max-parallel: 5
-      matrix: ${{fromJson(needs.bazel_compute_affected_targets.outputs.matrix)}}
+      max-parallel: 10
+      matrix: ${{ fromJson(needs.bazel_compute_affected_targets.outputs.matrix) }}
     env:
       ENABLE_CACHING: false
       CACHE_DIRECTORY: ~/.bazel_cache
@@ -77,17 +115,41 @@ jobs:
         with:
           version: 4.0.0
 
+      - uses: actions/cache@v2
+        id: scripts_cache
+        with:
+          path: ${{ env.CACHE_DIRECTORY }}
+          key: ${{ runner.os }}-${{ env.CACHE_DIRECTORY }}-bazel-scripts-${{ github.sha }}
+          restore-keys: |
+            ${{ runner.os }}-${{ env.CACHE_DIRECTORY }}-bazel-scripts-
+            ${{ runner.os }}-${{ env.CACHE_DIRECTORY }}-bazel-
+
       - name: Set up build environment
         uses: ./.github/actions/set-up-android-bazel-build-environment
 
-      - name: Compute test caching bucket
+      - name: Configure Bazel to use a local cache (for scripts)
         env:
-          TEST_TARGET: ${{ matrix.test-target }}
+          BAZEL_CACHE_DIR: ${{ env.CACHE_DIRECTORY }}
+        run: |
+          EXPANDED_BAZEL_CACHE_PATH="${BAZEL_CACHE_DIR/#\~/$HOME}"
+          echo "Using $EXPANDED_BAZEL_CACHE_PATH as Bazel's cache path"
+          echo "build --disk_cache=$EXPANDED_BAZEL_CACHE_PATH" >> $HOME/.bazelrc
+        shell: bash
+
+      - name: Extract test caching bucket & targets
+        env:
+          AFFECTED_TESTS_BUCKET_BASE64_ENCODED_SHARD: ${{ matrix.affected-tests-bucket-base64-encoded-shard }}
         run: |
-          echo "Test target: $TEST_TARGET"
-          TEST_CATEGORY=$(echo "$TEST_TARGET" | grep -oP 'org/oppia/android/(.+?)/' | cut -f 4 -d "/")
+          # See https://stackoverflow.com/a/29903172 for cut logic. This is needed to remove the
+          # user-friendly shard prefix from the matrix value.
+          AFFECTED_TESTS_BUCKET_BASE64=$(echo "$AFFECTED_TESTS_BUCKET_BASE64_ENCODED_SHARD" | cut -d ";" -f 2)
+          bazel run //scripts:retrieve_affected_tests -- $AFFECTED_TESTS_BUCKET_BASE64 $(pwd)/test_bucket_name $(pwd)/bazel_test_targets
+          TEST_CATEGORY=$(cat ./test_bucket_name)
+          BAZEL_TEST_TARGETS=$(cat ./bazel_test_targets)
           echo "Test category: $TEST_CATEGORY"
+          echo "Bazel test targets: $BAZEL_TEST_TARGETS"
           echo "TEST_CACHING_BUCKET=$TEST_CATEGORY" >> $GITHUB_ENV
+          echo "BAZEL_TEST_TARGETS=$BAZEL_TEST_TARGETS" >> $GITHUB_ENV
 
       # For reference on this & the later cache actions, see:
       # https://github.com/actions/cache/issues/239#issuecomment-606950711 &
@@ -96,7 +158,7 @@ jobs:
       # benefit from incremental build performance (assuming that actions/cache aggressively removes
       # older caches due to the 5GB cache limit size & Bazel's large cache size).
       - uses: actions/cache@v2
-        id: cache
+        id: test_cache
         with:
           path: ${{ env.CACHE_DIRECTORY }}
           key: ${{ runner.os }}-${{ env.CACHE_DIRECTORY }}-bazel-tests-${{ env.TEST_CACHING_BUCKET }}-${{ github.sha }}
@@ -129,7 +191,7 @@ jobs:
             rm -rf $EXPANDED_BAZEL_CACHE_PATH
           fi
 
-      - name: Configure Bazel to use a local cache
+      - name: Configure Bazel to use a local cache (for tests)
         env:
           BAZEL_CACHE_DIR: ${{ env.CACHE_DIRECTORY }}
         run: |
@@ -163,17 +225,45 @@ jobs:
           cd $GITHUB_WORKSPACE
           git secret reveal
 
-      - name: Run Oppia Test (with caching, non-fork only)
+      # See https://www.cyberciti.biz/faq/unix-for-loop-1-to-10/ for for-loop reference.
+      - name: Build Oppia Tests (with caching, non-fork only)
         if: ${{ env.ENABLE_CACHING == 'true' && ((github.ref == 'refs/heads/develop' && github.event_name == 'push') || (github.event.pull_request.head.repo.full_name == 'oppia/oppia-android')) }}
         env:
           BAZEL_REMOTE_CACHE_URL: ${{ secrets.BAZEL_REMOTE_CACHE_URL }}
-        run: bazel test --remote_http_cache=$BAZEL_REMOTE_CACHE_URL --google_credentials=./config/oppia-dev-workflow-remote-cache-credentials.json -- ${{ matrix.test-target }}
+          BAZEL_TEST_TARGETS: ${{ env.BAZEL_TEST_TARGETS }}
+        run: |
+          # Attempt to build 5 times in case there are flaky builds.
+          # TODO(#3759): Remove this once there are no longer app test build failures.
+          i=0
+          while [ $i -ne 5 ]; do
+            i=$(( $i+1 ))
+            bazel build --keep_going --remote_http_cache=$BAZEL_REMOTE_CACHE_URL --google_credentials=./config/oppia-dev-workflow-remote-cache-credentials.json -- $BAZEL_TEST_TARGETS
+          done
 
-      - name: Run Oppia Test (without caching, or on a fork)
+      - name: Build Oppia Tests (without caching, or on a fork)
         if: ${{ env.ENABLE_CACHING == 'false' || ((github.ref != 'refs/heads/develop' || github.event_name != 'push') && (github.event.pull_request.head.repo.full_name != 'oppia/oppia-android')) }}
+        env:
+          BAZEL_TEST_TARGETS: ${{ env.BAZEL_TEST_TARGETS }}
+        run: |
+          # Attempt to build 5 times in case there are flaky builds.
+          # TODO(#3759): Remove this once there are no longer app test build failures.
+          i=0
+          while [ $i -ne 5 ]; do
+            i=$(( $i+1 ))
+            bazel build --keep_going -- $BAZEL_TEST_TARGETS
+          done
+      - name: Run Oppia Tests (with caching, non-fork only)
+        if: ${{ env.ENABLE_CACHING == 'true' && ((github.ref == 'refs/heads/develop' && github.event_name == 'push') || (github.event.pull_request.head.repo.full_name == 'oppia/oppia-android')) }}
         env:
           BAZEL_REMOTE_CACHE_URL: ${{ secrets.BAZEL_REMOTE_CACHE_URL }}
-        run: bazel test -- ${{ matrix.test-target }}
+          BAZEL_TEST_TARGETS: ${{ env.BAZEL_TEST_TARGETS }}
+        run: bazel test --keep_going --remote_http_cache=$BAZEL_REMOTE_CACHE_URL --google_credentials=./config/oppia-dev-workflow-remote-cache-credentials.json -- $BAZEL_TEST_TARGETS
+
+      - name: Run Oppia Tests (without caching, or on a fork)
+        if: ${{ env.ENABLE_CACHING == 'false' || ((github.ref != 'refs/heads/develop' || github.event_name != 'push') && (github.event.pull_request.head.repo.full_name != 'oppia/oppia-android')) }}
+        env:
+          BAZEL_TEST_TARGETS: ${{ env.BAZEL_TEST_TARGETS }}
+        run: bazel test --keep_going -- $BAZEL_TEST_TARGETS
 
   # Reference: https://github.vviipp.vipmunity/t/127354/7.
   check_test_results:

diff --git a/scripts/BUILD.bazel b/scripts/BUILD.bazel
@@ -48,6 +48,13 @@ kt_jvm_binary(
     runtime_deps = ["//scripts/src/java/org/oppia/android/scripts/ci:compute_affected_tests_lib"],
 )
 
+kt_jvm_binary(
+    name = "retrieve_affected_tests",
+    testonly = True,
+    main_class = "org.oppia.android.scripts.ci.RetrieveAffectedTestsKt",
+    runtime_deps = ["//scripts/src/java/org/oppia/android/scripts/ci:retrieve_affected_tests_lib"],
+)
+
 # TODO(#3428): Refactor textproto assets to subpackage level.
 REGEX_PATTERN_CHECK_ASSETS = generate_regex_assets_list_from_text_protos(
     name = "regex_asset_files",

diff --git a/scripts/assets/maven_dependencies.textproto b/scripts/assets/maven_dependencies.textproto
@@ -565,8 +565,8 @@ maven_dependency {
   }
 }
 maven_dependency {
-  artifact_name: "com.google.protobuf:protobuf-lite:3.0.0"
-  artifact_version: "3.0.0"
+  artifact_name: "com.google.protobuf:protobuf-javalite:3.17.3"
+  artifact_version: "3.17.3"
   license {
     license_name: "Simplified BSD License"
     extracted_copy_link {

diff --git a/scripts/src/java/org/oppia/android/scripts/ci/BUILD.bazel b/scripts/src/java/org/oppia/android/scripts/ci/BUILD.bazel
@@ -14,5 +14,20 @@ kt_jvm_library(
     deps = [
         "//scripts/src/java/org/oppia/android/scripts/common:bazel_client",
         "//scripts/src/java/org/oppia/android/scripts/common:git_client",
+        "//scripts/src/java/org/oppia/android/scripts/common:proto_string_encoder",
+        "//scripts/src/java/org/oppia/android/scripts/proto:affected_tests_java_proto_lite",
+    ],
+)
+
+kt_jvm_library(
+    name = "retrieve_affected_tests_lib",
+    testonly = True,
+    srcs = [
+        "RetrieveAffectedTests.kt",
+    ],
+    visibility = ["//scripts:oppia_script_binary_visibility"],
+    deps = [
+        "//scripts/src/java/org/oppia/android/scripts/common:proto_string_encoder",
+        "//scripts/src/java/org/oppia/android/scripts/proto:affected_tests_java_proto_lite",
     ],
 )