From fd3e186ec23721ec8ca3eda4c926367f3cb31fba Mon Sep 17 00:00:00 2001
From: "William S. Moses"
Date: Tue, 8 Aug 2017 22:38:40 +0000
Subject: [PATCH] Squashed commit of the following:
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

commit 9eef73e8b7b5dab5d8e04a0fa584fd765e5b1d13
Author: TB Schardl  Date: Fri Aug 4 01:43:13 2017 +0000
    [TRE] Fix bug with Tapir modification of TRE that was causing unit tests to fail.

commit 92b16128f980b6683cb53a324480d7305f4327d4
Author: TB Schardl  Date: Thu Aug 3 13:10:01 2017 +0000
    [README] Attempting to clean up README file.

commit fa242e0f01133707c3a483cfabedf3ee28abba7a
Merge: a8e2b795fb3 f55a27066ac
Author: TB Schardl  Date: Thu Aug 3 12:52:13 2017 +0000
    Merge branch 'master' of github.com:wsmoses/Parallel-IR

commit a8e2b795fb34c87cd2c884235c3b50be0c17c3e7
Author: TB Schardl  Date: Thu Aug 3 12:49:10 2017 +0000
    [README] Updated README.

commit f55a27066ac03e39e6a01ca30e86bc48df76fa7e
Author: William S. Moses  Date: Tue Aug 1 20:17:47 2017 +0200
    Add CircleCI

commit 964b5bea84c59cdc7e27bc07e98f12edc821c4fc
Author: TB Schardl  Date: Wed Aug 2 21:35:11 2017 +0000
    [LoopSpawning] Correctly handle Tapir loops where the loop body uses the variable storing the number of loop iterations. Fixes #13

commit 8d4f443d9c9b78478279d598c4eb9abd79db1e59
Merge: 452aac7e148 ef122d645a8
Author: TB Schardl  Date: Wed Aug 2 21:35:22 2017 +0000
    Merge branch 'master' of github.com:wsmoses/Parallel-IR

commit 452aac7e14852491121f7ca26f24f420414a5245
Author: TB Schardl  Date: Wed Aug 2 21:35:11 2017 +0000
    [LoopSpawning] Correctly handle Tapir loops where the loop body uses the variable storing the number of loop iterations. Fixes #13

commit ef122d645a83c9ad9ee743329208ee001071a4f2
Author: William S. Moses  Date: Tue Aug 1 20:17:47 2017 +0200
    Add CircleCI

commit 9be75a22ad015c307665d277994651671a15ae60
Author: TB Schardl  Date: Mon Jul 10 15:57:49 2017 +0000
    [CSI] Bug fixes and refactoring of the CSI instrumentation pass.
commit 6ce5f2f27b1bc2d92e48420376c2a37d1608f3a1
Author: TB Schardl  Date: Mon Jul 10 13:37:39 2017 +0000
    [Tapir] Allow Tapir lowering to Cilk to fill in missing definitions of internal Cilk types, including __cilkrts_worker and __cilkrts_pedigree.

commit 631e4626d2ba614eaf8a68113c2fdf02f9f8e246
Author: TB Schardl  Date: Fri Jun 30 21:33:54 2017 +0000
    [DetachSSA] Initial implementation of an analysis pass that tracks the creation and synchronization of detached tasks. This analysis is based on MemorySSA.

commit 923a9052c95c43df1405fad56f2cb1ef12a47412
Author: TB Schardl  Date: Tue Jun 27 21:54:51 2017 +0000
    [Tapir] Adding support for sync regions.

    A sync region is designated by a token emitted by a call to @llvm.syncregion.start. The detach, reattach, and sync instructions all take this token as a parameter. A sync instruction in a sync region SR only waits on computations detached from detach instructions in the same sync region or in a detached descendant thereof.

    By convention, a call to @llvm.syncregion.start occurs in an entry block, that is, either the entry block of a function or the entry block of a detached sub-CFG. For Cilk programs, a sync region is started for any function that performs a _Cilk_spawn or _Cilk_sync. A separate sync region is also started for each _Cilk_for in the function.

    Sync regions address two issues with sync instructions. First, with sync regions, the implicit sync at the end of a _Cilk_for only waits on the parallel iterations of that _Cilk_for, not on any other spawned computation within the function. Second, when a function is inlined, any _Cilk_sync performed by that function will not erroneously wait on detached computations in its caller.

    This commit includes simple cleanup passes involving sync regions. One form of cleanup removes sync instructions in sync regions that contain no detach instructions. Another form removes empty sync regions, i.e., calls to @llvm.syncregion.start whose produced token is never used.
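The sync-region convention described in this commit message can be illustrated with a small Tapir IR fragment. This is a sketch only: `@work` and the block labels are hypothetical, and it simply instantiates the constructs the message names (a token from `@llvm.syncregion.start`, and detach/reattach/sync each naming that token via `within`).

```llvm
define void @spawn_and_sync() {
entry:
  ; By convention, the sync region is started in an entry block.
  %sr = call token @llvm.syncregion.start()
  ; Detach %spawned to run in parallel with %cont; both name %sr.
  detach within %sr, label %spawned, label %cont

spawned:
  call void @work()                 ; hypothetical spawned task
  reattach within %sr, label %cont

cont:
  ; Waits only on computations detached within sync region %sr,
  ; not on detaches belonging to any other sync region.
  sync within %sr, label %exit

exit:
  ret void
}

declare token @llvm.syncregion.start()
declare void @work()
```

Under this scheme, an inlined function's sync carries its own region token, so it cannot wait on detaches issued by its caller.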
    Future work will analyze sync regions more carefully and combine them when it is deemed safe.

commit 9b55aac80aca2a520ba7627a020af413be18a29f
Merge: 9b5abba8e85 eece7bcb178
Author: TB Schardl  Date: Sat Jun 3 12:42:01 2017 +0000
    Merge branch 'master' of github.com:llvm-mirror/llvm

commit 9b5abba8e85b01c08d49885fdc6d871ed0e522e9
Merge: 51a4df5f3e5 6ef5e10ad7e
Author: TB Schardl  Date: Wed May 31 02:07:52 2017 +0000
    Merge branch 'master' of github.com:llvm-mirror/llvm

commit 51a4df5f3e536a65c0a926ee7c87eb47c80aec7f
Merge: 6f69cdf478c 0559b4fa45c
Author: TB Schardl  Date: Tue May 30 18:19:52 2017 +0000
    Merge branch 'master' of github.com:llvm-mirror/llvm

commit 6f69cdf478cc2801c74964e3a233ad46d16245cc
Author: William S. Moses  Date: Mon May 15 01:15:30 2017 -0400
    remove Rhino print

commit d719d172fd8967cccb6625ff1ec54e439cdfe989
Merge: d2b4d301879 2db0ffd4753
Author: William S. Moses  Date: Mon May 15 01:04:30 2017 -0400
    Merge branch '6898' of github.com:wsmoses/Parallel-IR into 6898

commit d2b4d301879c0a75cbbd9d7c49e51581543ff08b
Author: William S. Moses  Date: Mon May 15 01:04:14 2017 -0400
    pushing rhino flag

commit 2db0ffd47534ee35deaea877d73d8484cb94c01f
Author: Douglas Kogut  Date: Mon May 15 00:24:54 2017 -0400
    spawn unswitch

commit 8f57e0739bf9fc6736472c89f91a533630efd5c3
Merge: 9660ce4abc0 be7eafc7179
Author: William S. Moses  Date: Sun May 14 17:36:17 2017 -0400
    Merge branch 'master' of github.com:wsmoses/Parallel-IR into 6898

commit 9660ce4abc060598a20b7c5d30a217bdc3af569e
Merge: 002fb57bb06 780934e4b6a
Author: William S. Moses  Date: Sun May 14 17:35:58 2017 -0400
    Merge branch 'master' into 6898

commit 002fb57bb069f18319ceab0d287c22166999a766
Merge: 35669cce54f acefa6d5a77
Author: William S. Moses  Date: Sun May 14 15:32:41 2017 -0400
    Merge branch '6898' of github.com:wsmoses/Parallel-IR into 6898

commit acefa6d5a77cad0cb2da8f5c6cfe3af1ca15129e
Author: Douglas Kogut  Date: Sun May 14 14:58:08 2017 -0400
    spawn unswitch

commit be7eafc7179b8591b0007a25a2e3aae31cfc7818
Author: TB Schardl  Date: Tue May 9 21:34:49 2017 +0000
    [Mem2Reg] Updated Mem2Reg to find the entry blocks of the function and all detached sub-CFG's more efficiently.

commit 12f929ae136d57fd9e744bc2dac8c072d01e2053
Author: TB Schardl  Date: Tue May 9 21:15:58 2017 +0000
    [CilkABI] Marked additional loads and stores to CilkRTS stack frames as volatile. Fixed bug in extracting exception-handling exit blocks for detached CFG's.

commit 9bf9a4d58c9f3a09164b8a86202bcee2f5abf553
Author: TB Schardl  Date: Tue May 9 21:14:33 2017 +0000
    [InstCombine] Fixed bug to prevent InstructionCombining pass from sinking operations that read memory across Tapir instructions.

commit 719872be7ce9d8cdbc7036c6eb7d3d77ebeff5cf
Merge: f63b0fed940 10826f2652f
Author: Douglas Kogut  Date: Fri Apr 28 20:39:49 2017 -0400
    Merge branch '6898' of github.com:wsmoses/Parallel-IR into 6898

commit f63b0fed9406ac9f5f8b54626a9c6ef965cceaba
Author: Douglas Kogut  Date: Fri Apr 28 20:39:34 2017 -0400
    pushing measuring scripts

commit 991ca791848c9936677a0b7184a77cf0eaf6734d
Author: TB Schardl  Date: Wed Apr 26 12:17:07 2017 +0000
    [LoopSpawning] Cleaning up code for handling exceptional exits.

commit 10826f2652fea87d11ec166954c2d7b02917c21d
Author: Jiahao Li  Date: Tue Apr 25 23:24:56 2017 -0400
    Alters sync elimination pfor microbenchmark.

commit 9d5172300fcd2528dc4db210beccfa6cecb7816f
Author: Jiahao Li  Date: Tue Apr 25 23:07:07 2017 -0400
    Makes LoopFusePass work.

commit 46720980313325bf80262b8fd447db8e90f1c307
Author: TB Schardl  Date: Wed Apr 26 00:10:42 2017 +0000
    [LoopSpawning] Bug fix to find all exception-handling exit blocks of a Tapir loop.
commit 48e7791f51c0a3b0fc27cc280e458892dac30fbd
Author: TB Schardl  Date: Tue Apr 25 01:30:48 2017 +0000
    [Tapir] Preliminary support for C++ exceptions on Linux.

commit 4613a6461de60516a6242270e4c6cd7beb1c5bec
Author: TB Schardl  Date: Tue Apr 25 01:28:09 2017 +0000
    [CSI] Updated CSI pass to support separate property types per IR object.

commit d5331895cb2d1437b7788469ac72c731b65a949b
Author: Jiahao Li  Date: Sat Apr 22 15:21:03 2017 -0400
    Have makefile for sync_elimination_pfor_mb emit .ll for the sync eliminated version.

commit 3b2b3c3429af3f1a173970cef45844639d35361b
Author: Jiahao Li  Date: Sat Apr 22 15:09:04 2017 -0400
    Cleans up makefile for sync_elimination_pfor_mb.

commit 21aa2bbee01f1dbc86681a7ed78b7cfd8fd611d5
Author: Bojan Serafimov  Date: Sat Apr 22 14:57:32 2017 -0400
    Fix compile error

commit 0c5e6d15f12288dc29e9f08ff9d011c1204f69ba
Author: Jiahao Li  Date: Sat Apr 22 14:45:38 2017 -0400
    Fixes sync_elimination_pfor_mb micro benchmark.

commit a387e9f3e16ab5253eec663bbb56c246e4dbda55
Author: Jiahao Li  Date: Sat Apr 22 14:26:06 2017 -0400
    Fixes SyncElimination blow up with function calls.

commit 44e8409f071578546b572b6dd807a83092867bfa
Author: Bojan Serafimov  Date: Mon Apr 10 12:06:51 2017 -0400
    Fix tests

commit adeb3eaaf5af3d9c816db1a704324c9f715a0277
Author: Jiahao Li  Date: Mon Apr 10 11:46:36 2017 -0400
    Handles instructions with null call sites.
commit 96f24b65e5a4634c8a78ac0e53dd552fe46d185d
Author: Bojan Serafimov  Date: Mon Apr 10 10:19:42 2017 -0400
    Ignore sync instruction in rosetta

commit d874567d6e6cdfc88c0faab3122975046162ec09
Author: Bojan Serafimov  Date: Tue Apr 4 19:14:29 2017 -0400
    Add nested loop test

commit 8f7734960776d31ddcb0cf690da837c3f7ee9229
Author: Bojan Serafimov  Date: Fri Mar 17 17:39:58 2017 -0400
    Fix bug in FindRosetta

commit e0bac90f990423a17e245cd6cb2d9f9f2b387951
Author: Bojan Serafimov  Date: Fri Mar 17 17:03:16 2017 -0400
    Add test cases

commit 7ccc4c9454b80ef03f14a0c03d86fceea2309581
Author: Jiahao Li  Date: Fri Mar 17 16:57:54 2017 -0400
    Fixes sync elimination test.

commit b5f16cfaf2ce8c9311104f356522c527cfe0b8ba
Author: Jiahao Li  Date: Fri Mar 17 16:51:37 2017 -0400
    Removes incomplete sync elimination test.

commit 344d075d08c6d23be99373b1b65a94fb6f92701d
Author: Jiahao Li  Date: Fri Mar 17 16:47:29 2017 -0400
    Removes function renaming in sync elimination.

commit 4045b1f2bd1d4e1ff6527bdc4349d9938e188463
Author: Jiahao Li  Date: Fri Mar 17 16:15:20 2017 -0400
    Fixes loop condition error in sync elimination.

commit 7eab317e1436d2fc456f0f625ef4888577c53bec
Author: Bojan Serafimov  Date: Fri Mar 17 16:33:40 2017 -0400
    Fix tests

commit 2c6412e1a4bb92a5fc86f63803a52ea22c43aa05
Author: Jiahao Li  Date: Fri Mar 17 14:54:13 2017 -0400
    Implements legality check for sync elimination.

commit a57ac4cafdfe845f0c90cc0611705c38f87f1905
Author: Bojan Serafimov  Date: Fri Mar 17 16:05:14 2017 -0400
    Add basic SyncElimination tests

commit a7c6bdec1a3562a9333e06497e362ab5e8e45613
Author: Bojan Serafimov  Date: Mon Mar 13 11:09:06 2017 -0400
    Implement sync removing

commit 271c65cf91c5a2223ebac864cb55d6137d6d00c4
Author: Jiahao Li  Date: Thu Mar 9 16:59:16 2017 -0500
    Implements Vegas-set finding for SyncElimination pass.

commit 72827d0cc4ef8b3fb556bdb4660c6b0891849b4f
Author: Jiahao Li  Date: Thu Mar 9 15:58:45 2017 -0500
    Implements Rosetta-finding part of SyncElimination pass.
commit df4c672499f76bcbfdf93806755e6f9ff15035f6
Author: Jiahao Li  Date: Thu Mar 9 15:08:28 2017 -0500
    Cosmetic cleanup.

commit 2682b3bf34c4efd7fc86e0af26d3a0b1dffc108f
Author: Bojan Serafimov  Date: Wed Mar 8 00:52:22 2017 -0500
    Add SyncElimination pass

commit 3856a31e3af623255498bc878b750e82c90a34b7
Author: Jiahao Li  Date: Sat Apr 22 16:27:38 2017 -0400
    Enables LoopFuse by default.

commit 6017d8b2a125a66cb418d247281433a5665ab249
Author: Jiahao Li  Date: Sat Apr 22 16:27:26 2017 -0400
    Rebases LoopFuse to compile on the current code base.

commit 367d9d916cbaf9d2433d267bf9c70be772fe8af7
Author: Jiahao Li  Date: Sat Apr 22 16:04:20 2017 -0400
    Replaces LoopAccessAnalysis with LoopAccessLegacyAnalysis in LoopFuse.

commit bb0b29851651bc1d122b7aed839a58edb4e656ce
Author: Jiahao Li  Date: Sat Apr 22 15:40:47 2017 -0400
    Applies https://reviews.llvm.org/D17386 for Loop Fusion Pass.

commit 3ce522e822ad2a0b047c0cc905cf59b8f4247d26
Author: Douglas Kogut  Date: Sat Apr 22 14:11:36 2017 -0400
    pushing spawn work

commit 0dd0df9b42bac64d82ffe5035f6d4f5d7b2dd2b0
Author: TB Schardl  Date: Thu Mar 30 12:40:37 2017 +0000
    [PassManager] Re-enabling passes that happen after optimizations when Cilk is not enabled.

commit 511ba02c8ccb2bf15a0791007229389352bffef9
Author: TB Schardl  Date: Thu Mar 16 14:25:49 2017 +0000
    [Tapir] When outlining, propagate available alignment information to the parameters of the outlined function.

commit 4722cecdb2cef0b0ab84c08f65ae296bb4c01a2f
Merge: 285ff461789 780934e4b6a
Author: TB Schardl  Date: Fri Mar 10 20:18:23 2017 +0000
    Merge branch 'master' of github.com:wsmoses/Parallel-IR

commit 285ff4617892da4132f4a0aded992dcc4c5af6d5
Author: TB Schardl  Date: Fri Mar 10 20:17:05 2017 +0000
    [Tapir] Fix to properly maintain allocas in the entry block of a detached context. These changes ensure that every detached context has an entry block with just one predecessor.
    These changes also move allocas among entry blocks during function inlining and the outlining process for lowering Tapir. These changes also remove syncs associated with parallel loops after outlining.

commit 489f0a4673d2b0364556382569e421fed347d301
Author: TB Schardl  Date: Fri Mar 10 20:14:03 2017 +0000
    [Local] Bug fix to make the GetDetachedCtx routine properly return the detached BB at the start of a detached context.

commit cd7e9f3c2d840182ab82830218703b78c657d1b0
Author: TB Schardl  Date: Fri Mar 10 20:11:56 2017 +0000
    [SimplifyCFGPass] Code cleanup and comments.

commit 35669cce54f33447d1f12423e71536ab31cf02e5
Merge: 1fae2a923fb 52889bc3118
Author: William S. Moses  Date: Wed Mar 8 11:33:46 2017 -0500
    Merge branch '6898' of github.com:wsmoses/Parallel-IR into 6898

commit 780934e4b6a8054900b774d9405c0dd426bd23be
Author: William S. Moses  Date: Tue Mar 7 18:08:44 2017 -0500
    Parallelize / Shorten compilation

commit 4cc8071621e2c159a755a594bdb5dde9fbdfe74d
Author: William S. Moses  Date: Tue Mar 7 17:37:28 2017 -0500
    Fix optimized llvm build

commit 26007676a05e6c0445a0971f5bbfb0a2b2e9c47b
Author: William S. Moses  Date: Tue Mar 7 17:31:40 2017 -0500
    Updated binary

commit 6917c16e028fb03a608ba2e2f33ce48c68900b92
Author: William S. Moses  Date: Tue Mar 7 17:21:27 2017 -0500
    Faster cmake and autobuild matrix

commit 088941d05808f63865028347f4fcd3cbc849ce08
Author: William S. Moses  Date: Tue Mar 7 16:56:44 2017 -0500
    Remove old cmake

commit c558e05a3917b7be37490cd45b6c2d9fc153adbc
Author: William S. Moses  Date: Tue Mar 7 16:55:17 2017 -0500
    Print directories for debugging script

commit 074121e15927e674b16e2656913ecd08d557a422
Author: William S. Moses  Date: Tue Mar 7 16:45:52 2017 -0500
    Leave directory in autobuild after cmake

commit 30a221e0a04ae4dae0575a092800799e7aa7792f
Author: William S. Moses  Date: Tue Mar 7 16:38:07 2017 -0500
    Build without parallel option

commit 7a7d719c26e78e049093f1869eb6573e7cb3e529
Author: William S. Moses  Date: Tue Mar 7 16:32:07 2017 -0500
    Build newer cmake from source

commit 24f129bf4857357c90f8458c2ce09b60ab112b36
Author: William S. Moses  Date: Tue Mar 7 16:24:00 2017 -0500
    Correct ppa

commit e2bc0fc2d7edc08fb427b6f0a30862c602e57dfb
Author: William S. Moses  Date: Tue Mar 7 16:21:28 2017 -0500
    Change CMake to sourceline

commit c6249f0bce0d9906f5d669c6d44d15f5977e09d3
Author: William S. Moses  Date: Tue Mar 7 16:16:37 2017 -0500
    Attempt newer CMake

commit fe47a0078d432ee911504fa05c1af0652122dce7
Author: William S. Moses  Date: Tue Mar 7 16:08:27 2017 -0500
    Build PClang along with Tapir

commit 8ee564cae3bbb672546427bab5137b90ce2fdc17
Author: William S. Moses  Date: Tue Mar 7 16:07:36 2017 -0500
    Build intel runtime using the Tapir compiler

commit 6750684c7007e0e6ea0300498e7196cf68c52176
Author: William S. Moses  Date: Tue Mar 7 16:00:50 2017 -0500
    Add configure to cilk runtime building

commit 3f3b46840218f1629f1183b1ef0772414ca145c2
Author: William S. Moses  Date: Tue Mar 7 15:57:18 2017 -0500
    Add make to dependency list

commit bd6f8df75f130bcf260fc4a3102d73341d21dc1b
Author: William S. Moses  Date: Tue Mar 7 15:54:50 2017 -0500
    Add cilk runtime building

commit 6372499258146bf9da15f0153c9e4f4d288578cc
Author: William S. Moses  Date: Tue Mar 7 15:42:22 2017 -0500
    Change autobuild cmake version

commit 9fec173620bf1c3c964292485f007a69fc05ca72
Author: William S. Moses  Date: Tue Mar 7 15:39:43 2017 -0500
    Change autobuild distribution

commit 1fae2a923fb632a6eb1dabc4826e3b2533735273
Author: William S. Moses  Date: Tue Mar 7 15:35:20 2017 -0500
    Relist as package

commit 52889bc31182f3faebcfce24918670967b5b96f6
Author: Douglas Kogut  Date: Mon Mar 6 12:11:10 2017 -0500
    pushing example opt pass

commit fe692e250aa8a78435200882ebb89c17f881c4d3
Author: TB Schardl  Date: Fri Mar 3 13:25:57 2017 +0000
    Ignoring debug build directory.
commit 69fa592b7e889be513f1004b1f13dd450a1be378
Merge: 3c56ed06c17 df445de9e82
Author: TB Schardl  Date: Fri Mar 3 13:20:52 2017 +0000
    Merge branch 'master' of github.com:wsmoses/Parallel-IR

commit 3c56ed06c17f764e2c1221df60e8ee45199b1577
Merge: 4611d796dea 2d562fe758b
Author: TB Schardl  Date: Fri Mar 3 13:19:05 2017 +0000
    Merge branch 'master' of github.com:llvm-mirror/llvm

commit df445de9e8252e5aff8a6d7645128df71b3bd45f
Author: William S. Moses  Date: Thu Mar 2 00:37:50 2017 -0500
    Correct CI build script

commit efa60d2d710c5697f6be5737898897cfb56b4509
Author: William S. Moses  Date: Wed Mar 1 16:07:01 2017 -0500
    Force travis-ci to rebuild

commit 66ed989e47c276699462c761b0e4f2b68ef5d951
Author: William S. Moses  Date: Tue Feb 28 16:18:35 2017 -0500
    Initial attempt at adding Travis autobuilder

commit b8a1f3fb7874d52fedb6db8a786695521a846709
Merge: 518873a5b44 a3bd7557fb6
Author: William Moses  Date: Tue Feb 28 11:49:18 2017 -0500
    Merge pull request #12 from YingVictor/master

    [LowerToCilk] Fix memory leak.

commit a3bd7557fb661ef0980599d430e7cd0a52f7f385
Author: Victor A. Ying  Date: Tue Feb 28 11:41:08 2017 -0500
    [LowerToCilk] Fix memory leak. SmallVector of NewHelpers needs to be deleted.

commit 518873a5b44c8ffc37282cb3887a1518525eca7f
Merge: 645daf3405c fb71c4aa6b4
Author: William Moses  Date: Sun Feb 26 17:29:34 2017 -0500
    Merge pull request #11 from YingVictor/master

    Two minor fixes

commit fb71c4aa6b408ce59e095b3d770ba01ab4eb9f51
Author: Victor A. Ying  Date: Sun Feb 26 16:53:55 2017 -0500
    [include/llvm-c/Transforms/Tapir.h] Fix function name mentioned in comment.

commit 2e658275b9935e536f86aec6b7f911b6c5e374cc
Author: Victor A. Ying  Date: Sun Feb 26 16:46:18 2017 -0500
    Properly remove traces of clang submodule.

    Removing a git submodule requires more than just deleting the entry in the .gitmodules file, as was done in the previous commit.
    It also requires deleting the special directory entry from the git index, which should be done using some variation of "git rm", such as:

        git rm --cached path/to/submodule

    Which is what I did in this commit.

commit 645daf3405c01f6e262373a6c849466f09162f44
Author: William S. Moses  Date: Fri Feb 24 15:35:50 2017 -0500
    Remove clang submodule

commit c9830e69c572885f6bfc7a74179a8e7efb6c851e
Merge: 3ad6c9cb76e 4611d796dea
Author: William S. Moses  Date: Fri Feb 24 15:33:45 2017 -0500
    Merge branch 'master' of github.com:wsmoses/Parallel-IR

commit 3ad6c9cb76eba2c5fbf7a5c8416ac28793d6455e
Author: William S. Moses  Date: Fri Feb 24 14:10:50 2017 -0500
    Update clang to stable

commit 4611d796dea964dea884c34cadcef14b256fbe56
Author: TB Schardl  Date: Tue Feb 21 19:46:22 2017 +0000
    [CodeExtractor] Removed unused function from CodeExtractor.

commit 73b2a05f9106a888ae92fbd9d89fd36be310bcce
Author: TB Schardl  Date: Sun Jan 15 14:19:32 2017 +0000
    [LoopSpawning] Restored warnings when LoopSpawning fails to transform a marked loop.

commit 710c06b2ffad2727ff751113b90b9905f4a3c845
Author: TB Schardl  Date: Sun Jan 15 14:18:54 2017 +0000
    [CodeExtractor] Removing old code for dealing with debug symbols.

commit ab75cf00f520c07d4dafa58328fa809780ac146b
Author: TB Schardl  Date: Fri Jan 13 22:25:29 2017 +0000
    [LowerToCilk] Renaming Detach2Cilk to LowerToCilk, as part of some code cleanup.

commit 2748779e158be086e9fa52300ccd5fcded978044
Author: TB Schardl  Date: Wed Jan 11 13:59:02 2017 +0000
    Updated associated version of Clang.

commit 738a76c83c83017faaeeaf959fb0c45b4586b08f
Author: TB Schardl  Date: Wed Jan 11 13:31:23 2017 +0000
    [test] Adding some simple regression tests for Tapir.

commit 5b63394d73f1d65ec6e338ed9ba8063895d8ef4e
Author: TB Schardl  Date: Mon Jan 9 19:11:44 2017 +0000
    [Tapir/Outline] Fix debug build.

commit df3dcb657228c40bff3ee7cab30944ed9e116021
Author: TB Schardl  Date: Mon Jan 9 02:31:01 2017 +0000
    [Tapir/Outline] Minor code cleanup.
commit facf7c87283b30b139fe75fbd4caacfc32c0fb37
Author: TB Schardl  Date: Mon Jan 9 02:29:07 2017 +0000
    [Detach2Cilk] Inline __cilk functions into generated helper functions.

commit c32adbf10f18c9a52e10de2e046329f67f635699
Author: TB Schardl  Date: Sun Jan 8 22:48:22 2017 +0000
    [LoopSpawning] Code cleanup for release build.

commit 3b460341f6a21344ddbc11100cd75ef079bcd8ee
Author: TB Schardl  Date: Sun Jan 8 22:41:02 2017 +0000
    [Detach2Cilk] Fixed creation of Cilk stack frames for release build.

commit 4bcdb952154d0daf4f18384cceda7f72e7b2542d
Author: TB Schardl  Date: Sun Jan 8 20:42:48 2017 +0000
    [SROA] Minor code cleanup.

commit 3c73fb9bf4d241c96c31f10c3a89074ffbf30774
Merge: 0d6f0aad70a 18687546b92
Author: TB Schardl  Date: Tue Jan 3 19:24:51 2017 +0000
    Merge branch 'new_lowering'

commit 18687546b9276fcb76c619193ee46b93f05a7001
Author: TB Schardl  Date: Tue Jan 3 17:18:12 2017 +0000
    [Detach2Cilk] Code cleanup.

commit 2a7c78c09452762cc784ac4cf92381340830a90c
Author: TB Schardl  Date: Tue Jan 3 16:59:48 2017 +0000
    [LoopSpawning] Added support for Tapir loops with exit blocks terminated by unreachable.

commit a1af329428f71f12decbe8776e2d9b4d9b377c63
Author: TB Schardl  Date: Sat Dec 31 17:06:01 2016 +0000
    [CSI] Fix formatting of CSI pass.

commit 08b3602ddb14e7bbe7fe78faa7a12c4fbd43e431
Author: TB Schardl  Date: Sat Dec 31 17:05:07 2016 +0000
    [CSI] Add function names to FED tables.
commit 1672db6417856784850c9aaa5f879c1bb5f6f539
Merge: a22c19d21b9 56516028d8b
Author: TB Schardl  Date: Sat Dec 31 14:59:27 2016 +0000
    Merge branch 'master' of github.com:llvm-mirror/llvm into new_lowering

commit a22c19d21b991cd92e7f64103166f66f0f89eabd
Merge: 04b71642665 7f580b605b2
Author: TB Schardl  Date: Tue Dec 20 14:25:09 2016 +0000
    Merge branch 'master' of github.com:llvm-mirror/llvm into new_lowering

commit 04b716426657e5cf52c69e6e6953492e1e3b7434
Author: TB Schardl  Date: Tue Dec 20 14:09:15 2016 +0000
    [LoopSpawning] Switching LoopSpawning back to implementing divide-and-conquer scheduling directly.

commit c03b7f076ab44c6e37edb033cf1b16950740fca7
Merge: 0cc6919dafd eaf3712d06e
Author: TB Schardl  Date: Mon Dec 19 21:47:05 2016 +0000
    Merge branch 'master' of github.com:llvm-mirror/llvm into new_lowering

commit 0cc6919dafdf326efdfa275f66556ad1a9abfe67
Author: TB Schardl  Date: Mon Dec 19 20:34:25 2016 +0000
    [Outline] Cleaning up the code.

commit 747d1e8211d2c6ce8eeee40a79d3f684e9747e1c
Author: TB Schardl  Date: Mon Dec 19 20:30:37 2016 +0000
    [LICENSE] Updated license to add copyright for changes to implement Tapir.

commit 0d6f0aad70ae0b75a4f71567bd098703070c3c56
Author: William S. Moses  Date: Sat Dec 17 23:15:13 2016 -0500
    add clang submodule

commit 463af403bf33e14b759a60377c95ffe3d1f74382
Author: TB Schardl  Date: Tue Dec 13 02:28:54 2016 +0000
    [LoopSpawning] Keeping two versions of divide-and-conquer loop spawning around.

commit fcae33a06441a48081c463f74d12fc5f6b9ce68a
Author: TB Schardl  Date: Tue Dec 13 02:21:17 2016 +0000
    [PassManagerBuilder] Modification to support more faithful reference pipeline for PPoPP.

commit 6a8c5d26ad24a6f35ca8afcc17f18ea89f790f09
Author: TB Schardl  Date: Sun Dec 11 22:29:25 2016 +0000
    [LoopSpawning] Fixed bug in computing loop count for using Cilk ABI call.
commit b8af887cac2f664ae780631cd14ea2a194ea042c
Author: Ubuntu  Date: Sun Dec 11 08:19:56 2016 +0000
    cilk abi loopspawning

commit 217f4eafa2694468cb3817fb65e05b95ddd1d0b3
Author: TB Schardl  Date: Sat Dec 10 20:39:12 2016 +0000
    [CilkABI] Bug fix to allow proper lowering when a loop is the entry of a detached CFG.

commit 82cb28db1a9877d923da8a038c8f33a9079b6121
Merge: 8a4ac0d5d6e 05bdd2ebfe8
Author: TB Schardl  Date: Mon Nov 28 21:20:47 2016 +0000
    Merge branch 'master' of github.com:llvm-mirror/llvm into new_lowering

commit 8a4ac0d5d6ee455a6000fd60cd37018642a2b5ba
Author: TB Schardl  Date: Mon Nov 28 15:58:29 2016 +0000
    [LoopSpawning] Refactored to be a FunctionPass, instead of a LoopPass. More work is needed for this pass to legally add functions to the current Module.

commit 7f96f2c38f8233502a50c6bfd66257be0915ea41
Author: TB Schardl  Date: Mon Nov 28 15:55:11 2016 +0000
    [LoopSimplify] Modified to ensure that the preheader of a loop is not terminated by a sync.

commit f84012859a7fd293377b87a2c0d95d2cbd75aee0
Author: TB Schardl  Date: Mon Nov 28 15:53:05 2016 +0000
    [Tapir/Outline] Cleaning up commented-out code.

commit 2e932359c6f63a76e6a040bdf577ca9f162ddd8f
Author: TB Schardl  Date: Mon Nov 28 15:52:22 2016 +0000
    [BasicBlockUtils] Modified SplitEdge to keep sync instruction in original block.

commit 32aeb36a6f76b69247231a1b57a9b66a32627ed1
Author: TB Schardl  Date: Mon Nov 28 15:50:19 2016 +0000
    [Detach2Cilk] Making Detach2Cilk a ModulePass, instead of a FunctionPass, so it can safely add functions to the module.
commit 6ab23d5f49ab42f2d3074523570cf72cd7ee6d02
Merge: 56598980fc5 52894d83e1a
Author: TB Schardl  Date: Sat Nov 26 17:23:45 2016 +0000
    Merge branch 'master' of github.com:llvm-mirror/llvm into new_lowering

commit e189e6c97da75849d75b512dd5513c0ec5a09af4
Merge: 6952888faaa c3bdfe57eb1
Author: Ubuntu  Date: Thu Nov 24 17:07:50 2016 +0000
    Bring up to date with most recent llvm

commit 56598980fc58d0bd68e2957eb45371eb23245995
Merge: 6a33185a05c 3e65807a6f1
Author: TB Schardl  Date: Wed Nov 23 18:31:46 2016 +0000
    Merge branch 'master' of github.com:llvm-mirror/llvm into new_lowering

commit 6952888faaaf797beb00934eee0c99f85fbfeea5
Merge: e79c0d93864 e372554cd73
Author: TB Schardl  Date: Fri Nov 11 21:42:16 2016 +0000
    Merge branch 'master' of github.com:wsmoses/Parallel-IR

commit e79c0d93864a579bf6b865802e182a7b80d9ea48
Author: TB Schardl  Date: Fri Nov 11 21:34:37 2016 +0000
    [PassManager] Ensure that extensions to the pass manager that are intended to run last only run once on Tapir programs.

commit 6a33185a05c72739458a92e13a103ed4b3ae4b97
Author: TB Schardl  Date: Fri Nov 11 21:34:37 2016 +0000
    [PassManager] Ensure that extensions to the pass manager that are intended to run last only run once on Tapir programs.

commit 6f2c14afe41e2bb9729976b52734d98f3c99bae3
Author: TB Schardl  Date: Fri Nov 11 21:18:30 2016 +0000
    [LoopSpawning] Ensure that calculation of a Tapir loop limit is inserted at the end of the loop's preheader.

commit e372554cd7396b1facc00f6d5df7d51f89553e31
Author: William S. Moses  Date: Thu Nov 3 23:57:38 2016 -0400
    Remove some debug prints

commit 6baad834b9903206be5830e9a5d81cb8c118dc80
Author: William S. Moses  Date: Thu Nov 3 23:54:44 2016 -0400
    Remove some debug prints

commit 782593d7bcd41736b148b6b128890d31f0d49f10
Author: TB Schardl  Date: Tue Nov 1 14:40:47 2016 +0000
    [LoopSpawning] Cleaning up code and debug output.
commit f604273ecf927017dc48afdae928477f8708e0d5
Author: TB Schardl  Date: Tue Nov 1 14:39:42 2016 +0000
    [Detach2Cilk] Should not need to inline detached helper functions anymore, because Detach2Cilk should properly handle debug symbols.

commit 20d299f2d2839b1f45b6716970f5a99ee821cec3
Author: TB Schardl  Date: Tue Nov 1 14:37:40 2016 +0000
    [PassManagerBuilder] Run SimplifyCFG after Detach2Cilk to clean up cruft left by Detach2Cilk.

commit 1610d83dd9f26a9f47004634f83b7e5a614f46f6
Author: TB Schardl  Date: Tue Nov 1 14:36:49 2016 +0000
    [Detach2Cilk] Fix to ensure that Phi nodes in the continuation of a detach are still valid after lowering the detach to Cilk runtime calls.

commit ea14d8bd01adccba902cdae883625698319b7d61
Author: TB Schardl  Date: Tue Nov 1 04:42:24 2016 +0000
    [CilkABI] Converting Detach2Cilk pass to use new Tapir outlining methods, in order to handle debug symbols more correctly.

commit 1f30c735f929c5821cf575aeea59ee1b6eef3164
Author: TB Schardl  Date: Mon Oct 31 21:56:25 2016 +0000
    [LoopSpawning] Fixed bugs to properly erase loops after performing transformation and to handle preheaders terminated by syncs.

commit a86651dd973a6f0743b4a360396dba6360fc5bdf
Author: TB Schardl  Date: Mon Oct 31 21:54:45 2016 +0000
    [Outline] Cleaning up CreateHelper Tapir outlining method.

commit 31691cd15ae0f76c40420339849f652888294863
Author: TB Schardl  Date: Mon Oct 31 15:38:08 2016 +0000
    [LoopSpawning] Cleaning up LoopSpawning code, and adding output to loop-spawning reports.

commit 51220e44f007bb6b5be02ecbbf2e20840634daba
Author: TB Schardl  Date: Mon Oct 31 15:34:55 2016 +0000
    [Tapir] Renaming TapirOutline to Outline.

commit 6950ba60b07973d535c06f288e0ed30b14d43aa9
Author: TB Schardl  Date: Sun Oct 30 19:19:15 2016 +0000
    [TargetLoweringBase] Dealing with compile warning on TargetLoweringBase.

commit 581677b179aa2ed89134c8034ac491fae68595f0
Author: TB Schardl  Date: Sun Oct 30 19:18:10 2016 +0000
    [LoopSpawning] Replacing Loop2Cilk with LoopSpawning.
commit 39d404b1998c4c2d3635939c27f85c70e987d70f
Author: TB Schardl  Date: Sun Oct 30 18:54:23 2016 +0000
    [DiagnosticInfo] New method for emitting warning messages for the LoopSpawning pass.

commit 3d834b9e67f2779d2acd2bfd65d0b192561597d1
Author: TB Schardl  Date: Thu Oct 27 21:27:33 2016 +0000
    Updating passes to run around new Loop2Cilk implementation.

commit 35ec023f57f3a240f598d2a9822ec29aedcaf48c
Author: TB Schardl  Date: Thu Oct 27 21:25:43 2016 +0000
    Moving Tapir-specific transformations to a separate subdirectory under Transforms.

commit 3aae9e2c7b3402a3816f5b31a70a9326674c7a9f
Author: TB Schardl  Date: Sat Oct 22 14:40:05 2016 +0000
    [Cilk] Refactoring components for lowering Tapir to Cilk runtime calls.

commit 0a92f963f5978e3f7cd91a1f77a9b3040b4a2baf
Merge: 54f16a4669d fe05c97a9eb
Author: TB Schardl  Date: Sat Oct 22 14:33:05 2016 +0000
    Merge branch 'master' of github.com:wsmoses/Parallel-IR

commit 54f16a4669deaefc6a92a6f098485ee2d02d608b
Author: TB Schardl  Date: Sat Oct 22 14:30:27 2016 +0000
    [Local] Cleaned up formatting to get rid of tabs.

commit a8fade288fdbc1e194b7b0adba5ebdf61f05cb38
Author: TB Schardl  Date: Sat Oct 22 14:28:18 2016 +0000
    [Local] Fix to SerializeDetachedCFG to preserve debug symbols.

commit 5cc10ed3110941799eb681ad00833028ca692193
Author: TB Schardl  Date: Sat Oct 22 14:17:40 2016 +0000
    [Instrumentation] Adding CSI instrumentation pass, copied from https://github.com/CSI-LLVM/.

commit fe05c97a9eb98c01cfaa7a1a5129b0d002e2db70
Author: William S. Moses  Date: Sat Oct 22 10:00:23 2016 -0400
    Resolve issue 7

commit 4664388bb8c70312e21d321196942924a23955ff
Author: TB Schardl  Date: Wed Oct 19 16:01:28 2016 +0000
    [emacs] Added detach, reattach, and sync as control instructions in LLVM's emacs mode.

commit c0e8f4fe8db4bdac7f84bbf2ce6cb8a73a9252bd
Author: TB Schardl  Date: Mon Oct 17 04:14:35 2016 +0000
    [SSAUpdater] Derive the correct value from detached predecessors.
commit 2abd121b4c25579045347105a56b8383d0cefb9d Author: TB Schardl Date: Fri Oct 14 21:46:24 2016 +0000 [LICM] Fixing compiler crash when LICM attempts to move a store outside of a Tapir loop.
commit 28606d0fb2e4e2bcaf37959292c2a89cedaf7a1e Author: TB Schardl Date: Thu Oct 13 02:12:43 2016 +0000 [AliasAnalysis] Minor formatting change.
commit e5e04d08d7ddad2e021d0744ef52c52048955a2c Author: TB Schardl Date: Thu Oct 13 02:08:30 2016 +0000 [InlineFunction] Preventing InlineFunction from moving alloca's out of their detached context after inlining.
commit 14719bb0513004960e3c8b0571b82981cc2b1239 Merge: 84848c51548 7f4bee18532 Author: William S. Moses Date: Thu Oct 6 13:53:55 2016 -0400 Merge branch 'master' of github.com:wsmoses/Parallel-IR
commit 84848c51548b59b6beafa5c90615f36e64500199 Author: William S. Moses Date: Thu Oct 6 13:53:50 2016 -0400 Allow full unrolling of cilk for loops
commit 7f4bee185325eebc78533ef450a45e43926da694 Author: TB Schardl Date: Thu Oct 6 16:51:37 2016 +0000 [AliasAnalysis] Force AliasAnalysis to fail fast if it finds a detached CFG that reaches its own Detach instruction.
commit a2c6e22dd11c4212dbb64ce15020f677d77ed479 Author: TB Schardl Date: Tue Oct 4 22:44:38 2016 +0000 [Loop2Cilk] Fix splitting of loop preheaders that are terminated by sync instructions.
commit 1d1bdcf375abd2e0e83a8500278acc6124bf16f2 Author: William S. Moses Date: Sun Oct 2 23:19:30 2016 -0400 minor modref fix
commit 9ca914a946ee787fa8750a0a622d0f901641f2cf Author: William S. Moses Date: Fri Sep 23 16:12:32 2016 -0400 fix line info
commit 16395e5ae2ab1cbc17de82c0127680aeccecedc1 Author: William S. Moses Date: Thu Sep 22 09:08:42 2016 -0400 Additional clean up
commit af36e03c8282f4c431260dbfe16e3c323c72b82d Author: William S. Moses Date: Wed Sep 21 16:56:01 2016 -0400 clean up unrolling
commit 87d19e853f283cf9fac9c1e71239e34227fad27c Author: William S. Moses Date: Wed Sep 21 16:48:27 2016 -0400 resolve move to clang 4
commit 79323f66683946df1702005e3071f7fed23f0c3d Author: William S. Moses Date: Thu Sep 15 15:06:36 2016 -0400 fix tre
commit 574835b96b09f8d9b496f17c303b7a3457cd2e1f Author: William S. Moses Date: Thu Sep 15 12:01:49 2016 -0400 Fix mem2reg bug
commit 88cccc72240abd17a1dec0b2d238686919db7e81 Author: William S. Moses Date: Tue Sep 13 17:14:44 2016 -0400 fix running bugs
commit f449ac224baed049d3a4eecaccaeef7ac0954e36 Author: William S. Moses Date: Mon Sep 12 14:10:31 2016 -0400 fmt
commit 1d618f6fc664f473131fa11d3b5ba495e3d1cbbd Author: William S. Moses Date: Mon Sep 12 14:08:22 2016 -0400 fmt
commit 05d2fe180fe4980474f8e7317936b312b749e048 Author: William S. Moses Date: Mon Sep 12 14:07:24 2016 -0400 fmt
commit cb166968bc4f79b54e24272b59f935e3239109c6 Author: William S. Moses Date: Wed Aug 17 22:11:31 2016 -0400 solid
commit 1be62909730984141b5afbec84c48823735c4429 Merge: c3eb1b7594a e65e275cf2f Author: William S. Moses Date: Wed Aug 17 18:01:27 2016 -0400 Merge remote-tracking branch 'llvm/master'
commit c3eb1b7594a5953a324015aa08f745e31fb0ec65 Author: William S. Moses Date: Wed Aug 17 18:00:22 2016 -0400 cleanup
commit 925a26d33e5aa664ed2a950bfac6f123832d28f1 Author: William S. Moses Date: Wed Aug 17 17:55:49 2016 -0400 cleanup
commit 8a4aa28bc1ac48d2073507eb365e2461b206f524 Merge: 9ee354913cb 7177ff558c7 Author: William S. Moses Date: Wed Aug 17 02:54:17 2016 -0400 merge to mainline
commit 9ee354913cb1d00c79b0173d87e8259db193d73f Author: William S. Moses Date: Mon Aug 15 01:43:52 2016 -0400 Add race detector
commit 9b7715ebfc3bdd80382cbce7ca724868789c9cd6 Author: William S. Moses Date: Wed Aug 10 00:04:31 2016 -0400 cmake fixes
commit b66e56629e6ddd6895342d281ed510b011cecff1 Author: Ubuntu Date: Fri Jul 29 21:11:20 2016 +0000 LICM fix
commit c1aabfb01f044642dc9fb4317313d408c3cc39fc Author: William S. Moses Date: Wed Jul 27 21:22:20 2016 -0400 add merge functions
commit 72b025f6f0d254ab7e37e7cabb42e9e27f01ede8 Author: William S. Moses Date: Wed Jul 20 13:40:34 2016 -0400 fix dt
commit 39c33184af36efb1af71591940caf1924ace5ac8 Author: William S. Moses Date: Wed Jul 20 13:34:33 2016 -0400 fix dt
commit af099d0ad6a6c263f969e2c8b577d8a6c80bd685 Author: William S. Moses Date: Wed Jul 20 13:14:30 2016 -0400 fix dt
commit 920d83fc1bed8c82c0f2ccf58379371445206469 Author: William S. Moses Date: Wed Jul 20 12:12:44 2016 -0400 fix ph issue
commit b0abbc37c6e836acf46b8703b54a0881fd499b96 Author: William S. Moses Date: Wed Jul 20 11:49:12 2016 -0400 resolve print
commit d7aa05a4ebf5866d9fe70dd3733e9e20df4fdd76 Author: William S. Moses Date: Tue Jul 19 18:10:57 2016 -0400 major pbbs bugfix
commit f470066edb8b7a8d8db7cef0b9a7b65f8fd8090a Author: William S. Moses Date: Tue Jul 19 14:31:06 2016 -0400 fix pbbs bug
commit e1ac630d820ec2a7455392f4ddc9c4c620ea26c2 Author: William S. Moses Date: Mon Jul 18 21:35:07 2016 -0400 mod graint position
commit 0e725b855f90f63703d71a8761f717697912b65c Author: William S. Moses Date: Mon Jul 18 21:14:16 2016 -0400 mod graint position
commit 83e0982370d9a89d4f0b0b33636511568d8eda40 Author: William S. Moses Date: Mon Jul 18 16:17:40 2016 -0400 cilk abi fixes
commit 63738d884d78c5297d1c781da81b6599e9cdeba3 Author: William S. Moses Date: Mon Jul 18 13:07:38 2016 -0400 fix recursive idx
commit 45ca520784a38bbc13b0d00597310d931c757e4b Author: William S. Moses Date: Mon Jul 18 02:25:34 2016 -0400 fix issues with d2c extraction
commit 0e9c93c9d38a035d1ea88c2fbfbff6d6144cde0f Author: William S. Moses Date: Sun Jul 17 22:21:06 2016 -0400 add reopt
commit ec8c23de30635cb0969514bd18068d4e2bd77ec9 Author: William S. Moses Date: Sun Jul 17 22:18:39 2016 -0400 prevent rerunning passes
commit 8d6bd63be4a6c8ebf61be02b9d2d8535de3b9484 Author: William S. Moses Date: Thu Jul 14 13:19:44 2016 -0700 fix asm errors
commit f83bdc1fab9bf732ea0be8b134cea617e4f85500 Author: William S. Moses Date: Tue Jul 12 08:18:01 2016 -0700 fix unreachable merge domtree bug
commit 662b5a7e0018b659b08dc9256dfd61f94d756f56 Author: William S. Moses Date: Mon Jul 11 16:04:43 2016 -0400 Resolve issues with bounds detection in loop2cilk
commit 4866c5da1c28d2c67dc168edf119cc4adfbc07f3 Author: William S. Moses Date: Thu Jul 7 09:28:14 2016 -0400 minor attr fix
commit 1f4c43c41f109f82859a88525a851f00b2e1b5e4 Author: William S. Moses Date: Thu Jun 30 15:05:11 2016 -0400 fix bounds error
commit 0caf3f63eb873abb93e06080eb875f0945c5c2df Author: William S. Moses Date: Thu Jun 30 14:13:54 2016 -0400 speedup fix
commit 5cf555f901601c76bc416f7ef94dc77b375bcf84 Author: William S. Moses Date: Thu Jun 30 12:41:46 2016 -0400 resolve linker issues
commit 25e91bfc5f42f6eb1977cefe90336e85994d65d3 Author: William S. Moses Date: Thu Jun 30 12:37:47 2016 -0400 prevent l2c recursive loops
commit 325bce7bb19e0e4828e6f7eba6ba6420a1f59f7a Author: William S. Moses Date: Wed Jun 29 22:41:14 2016 -0400 fix issue with loop parents
commit 8e0997cb4b85e14c83783d81a7e3815d64fc6056 Author: William S. Moses Date: Wed Jun 29 21:10:51 2016 -0400 more efficient loops
commit f302f9480f94a4e7f816707e5224c85e0bf07218 Author: William S. Moses Date: Wed Jun 29 01:05:05 2016 -0400 l2c computes grain size
commit 1dbd257083c5d5e95fa662cc99da0b150aed94e2 Author: William S. Moses Date: Tue Jun 28 16:47:52 2016 -0400 more error info for bad return state
commit ec4340b4cee3951abf49ad1636bff07cb77fb80f Author: William S. Moses Date: Mon Jun 27 17:57:49 2016 -0400 fix accidental breakage
commit 88ceb1203926d59578e2c0dba02bf3b38f374120 Author: William S. Moses Date: Mon Jun 27 14:39:50 2016 -0400 fix loop2cilk indvar incr adding issue
commit 0a1cbbf7dff910f348713a88108169e03dabf3de Author: William S. Moses Date: Fri Jun 24 13:43:53 2016 -0400 Better Parallel TRE
commit bc96f0b3f141176d1667b1700be945aed7520e9c Author: William S. Moses Date: Fri Jun 24 01:38:46 2016 -0400 Parallel TRE
commit 579d39d8efab448cacf9c41aea8197226c64bfe4 Author: William S. Moses Date: Thu Jun 23 13:47:13 2016 -0400 more secure sync detect for loop2cilk
commit c06f49770a26c971efe66356b90a0a1ef7f2a301 Author: William S. Moses Date: Wed Jun 22 16:57:07 2016 -0400 Fix alloca issues for detached code
commit 150056edc4a2bb03c0bbe94923cfa189ce44f052 Author: William S. Moses Date: Tue Jun 21 19:17:47 2016 -0400 minor opt diff
commit 497c3b498bc8ce71ad913dff063853204810f402 Author: William S. Moses Date: Tue Jun 21 15:02:58 2016 -0400 modify pass
commit 01e49c3727f69e2da875989b4e61ab10fc058327 Author: William S. Moses Date: Tue Jun 21 01:14:31 2016 -0400 fix loop2cilk recog issue
commit 1c52cbf136f247110b7c9e4cac0a5a0d73ad63f7 Author: William S. Moses Date: Tue Jun 21 00:35:03 2016 -0400 remove pre sroa
commit 510bfacf5154f48e729c159c95c965acf4eef120 Author: William S. Moses Date: Mon Jun 20 20:36:34 2016 -0400 loop2cilk fixes to indvar
commit ef34ac80086a10e3ae04b9fd2ce4d99436eaa69e Author: Ubuntu Date: Mon Jun 20 19:00:07 2016 +0000 Resolve linker errors
commit 4387eb25bb6e36f0e5f8d04c9d9d3f710864044a Author: William S. Moses Date: Mon Jun 20 14:47:48 2016 -0400 Loop2cilk new indvar calculation
commit d4e44d43b5c6e40883975e87aa2c4c46759a8eb8 Author: William S. Moses Date: Mon Jun 20 04:10:48 2016 -0400 loop2cilk without opts
commit 9164742231eb140864e17562dd7e79161685e293 Author: William S. Moses Date: Mon Jun 20 03:48:51 2016 -0400 correct loop bounds calculation
commit d0d80c596491f3d8b7b9f2479f996f9345e9f059 Author: William S. Moses Date: Sun Jun 19 00:43:55 2016 -0400 clean up compile
commit 26beb619a1384b470ca0e668c1a838ee85b78b75 Author: William S. Moses Date: Fri Jun 17 14:37:46 2016 -0400 remove debug message
commit 76a163ddffdb916de1bee5fef34298e676266bff Author: Ubuntu Date: Wed Jun 15 20:58:36 2016 +0000 nomem
commit 126c754b4f8e553e6b9ff33f899afaaf4182ee04 Author: William S. Moses Date: Wed Jun 15 15:41:57 2016 -0400 fixes and less print
commit cd037d2993381148f11954f51ff89c6b5e599086 Author: William S. Moses Date: Tue Jun 14 23:33:28 2016 -0400 restore cilkabi
commit 5964e893682feec3a63d17999d32c2125486e879 Author: William S. Moses Date: Tue Jun 14 23:19:52 2016 -0400 fix inline bug
commit b5a22ebc589fc25b72f513eb16ccbedc6482e9f2 Author: William S. Moses Date: Tue Jun 14 14:32:41 2016 -0400 cleanup dumps
commit 2ab9f07b81a7fb04c33926c2899c4af1753d6175 Author: William S. Moses Date: Tue Jun 14 14:30:04 2016 -0400 cleanup dumps
commit 56d8d0f052de051328c2077bcd47e75f34d9f034 Author: William S. Moses Date: Tue Jun 14 12:35:26 2016 -0400 cleanup dumps
commit d95ce1575159c12135952b3fa39a092bc77ad298 Author: William S. Moses Date: Tue Jun 14 12:29:38 2016 -0400 addl sroa fixes
commit 2754c0b40a4ca26d3201005a1d2796b840bdcce7 Author: William S. Moses Date: Tue Jun 14 12:16:02 2016 -0400 loop2cilk ordering issue for ind var calculation fixed
commit bebf5cc0565d9060e78a3caeb880b2ce8f43b36c Author: William S. Moses Date: Tue Jun 14 11:27:20 2016 -0400 Fix SROA for detached allocas
commit 222ecb6dfd053282d450cbe9cffc7cea4d98fa5d Author: William S. Moses Date: Tue Jun 14 00:36:00 2016 -0400 minor bugfix
commit 446ad1a3bad89a44dd2c361cc0d9417a0a07eb2b Author: William S. Moses Date: Mon Jun 13 21:59:25 2016 -0400 bugfixes
commit bc37ee11a97c23b0576d45bcc94e7a597ff30a39 Author: William S. Moses Date: Thu Jun 9 10:43:21 2016 -0400 Fix odd LICM error
commit abfc103a0f06248526972ddd6f6057e372d56383 Author: William S. Moses Date: Wed Jun 8 01:04:49 2016 -0400 parallel opt levels and fix codegen pt 1
commit cab96d82f5d94a4a6745983953f43850d3a80f7d Author: William S. Moses Date: Fri Jun 3 01:43:13 2016 -0400 fix compile script
commit 6284487a349fe982d5d24d2ff45d8ff5c8d25708 Author: William S. Moses Date: Fri Jun 3 01:41:01 2016 -0400 fix l2c
commit 3783dfebd1a8d94ab40b958e03ffb99ac54e3f5b Author: William S. Moses Date: Thu Jun 2 23:50:39 2016 -0400 Fix allocation issues
commit fc2042d6a1331df9a55148208d27b2c2d4834ef7 Author: William S. Moses Date: Mon May 30 15:20:22 2016 -0400 add unique block debug info
commit cd3303d769327d50bcf3a422496190ed349cbaac Author: William S. Moses Date: Mon May 30 15:17:18 2016 -0400 fix exit block detection l2c
commit 4865203b50d0ad69531b6459a35d557908db3ffe Author: William S. Moses Date: Mon May 30 15:02:11 2016 -0400 fix sync l2c detection issue
commit e95a55ae8775dfe21c0ce10e0ea32332bc3d973a Author: William S. Moses Date: Sun May 29 23:31:59 2016 -0400 allow switch and better cmp block
commit b17417485a42308842840748c73c76953302dc30 Author: William S. Moses Date: Sun May 29 22:09:34 2016 -0400 fix issues in multiple phi nodes for l2c
commit f64fca467066650bdab351a55ec38943d360fced Author: William S. Moses Date: Sun May 29 17:29:00 2016 -0400 add addl check for loop2cilk
commit 8d9ac096f9beda10ff400631aae3336b5cb0982e Author: William S. Moses Date: Sat May 28 22:36:56 2016 -0400 minor script fix
commit 748021ae6a76b9d6e2ecb85b3e247455d5e9bdb9 Author: William S. Moses Date: Sat May 28 22:24:41 2016 -0400 lots of minor cilk error fixes
commit 0132cc1ce667fd8c21adaf5b3abd5dfadac80c09 Author: William S. Moses Date: Wed May 25 11:52:28 2016 -0400 fix bug in l2c about branching into
commit 9f921005730c6c92fbdf19b36714488c72c0975e Author: William S. Moses Date: Tue May 24 23:40:12 2016 -0400 fix bug in loop2cilk
commit a9d9cd9529c20022fd5ca0600042065cfee21d8f Author: William S. Moses Date: Sun Apr 10 14:32:22 2016 -0400 resolve block seg
commit 7410b7bcfbf610b34a0f42c0966cbdbd2e9b2e97 Author: William S. Moses Date: Sun Apr 10 13:55:01 2016 -0400 fixes
commit 11a77b870e734e617b00e4b55f09526cf2ac37d4 Author: William S. Moses Date: Thu Apr 7 03:04:30 2016 -0400 add compile
commit f2ec969a1965da3224fdffed035b9d39114d2b9a Author: William S. Moses Date: Thu Apr 7 03:04:17 2016 -0400 pre detach merging / loop unroll fixes
commit 9c00e9b80d865cf478607a4ddb90ca018ad2978c Author: William S. Moses Date: Thu Apr 7 00:27:15 2016 -0400 sync fix
commit 1f3c6dcb9d48ba519fde34c66b657571949428f7 Author: William S. Moses Date: Thu Apr 7 00:12:58 2016 -0400 bug fixes
commit 0f1b1cf061ab790622c6498e0df9c5487a8d610c Author: William S. Moses Date: Tue Apr 5 18:44:04 2016 -0400 resolve delete issues
commit 86cd5870f9d667ff36b2c10971216e8f6d0977d0 Author: William S. Moses Date: Tue Apr 5 13:10:36 2016 -0400 resolve delete issues
commit 06defa794acaf1f13ecdd63d57b38a49e2561492 Merge: 2f7e6ec4fa6 8b47c17a53d Author: William S. Moses Date: Tue Apr 5 11:57:10 2016 -0400 Merge remote-tracking branch 'llvm/release_38'
commit 8b47c17a53d683f313eaaa93c4a53de26d8fcba5 Author: Dimitry Andric Date: Tue Apr 5 06:58:21 2016 +0000 Merging r264335:
  r264335 | dim | 2016-03-24 21:39:17 +0100 (Thu, 24 Mar 2016) | 17 lines
  Add <atomic> to ThreadPool.h, since std::atomic is used
  Summary: Apparently, when compiling with gcc 5.3.2 for powerpc64, the order of headers is such that it gets an error about std::atomic<> use in ThreadPool.h, since this header is not included explicitly. See also: https://llvm.org/bugs/show_bug.cgi?id=27058 Fix this by including <atomic>. Patch by Bryan Drewery.
  Reviewers: chandlerc, joker.eph
  Subscribers: bdrewery, llvm-commits
  Differential Revision: http://reviews.llvm.org/D18460
  git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@265380 91177308-0d34-0410-b5e6-96231b3b80d8
commit 295c7a62d88d363361198766ce95900441727da9 Author: Renato Golin Date: Sat Apr 2 20:36:55 2016 +0000 Merging r263714: ARM: Revert SVN r253865, 254158, fix windows division
  git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@265245 91177308-0d34-0410-b5e6-96231b3b80d8
commit 2a2d901e3c55aff48990de5e415c429c4cfeb6d8 Author: Renato Golin Date: Sat Apr 2 20:32:54 2016 +0000 Merging r263123: ARM: follow up improvements for SVN r263118
  git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@265244 91177308-0d34-0410-b5e6-96231b3b80d8
commit 97a35e605ab417f11be4ccb532fcc9015ebb2ca8 Author: Renato Golin Date: Sat Apr 2 20:31:15 2016 +0000 Merging r263118: ARM: correct __builtin_longjmp on WoA
  git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@265243 91177308-0d34-0410-b5e6-96231b3b80d8
commit dec3a22cf5b8f8e6c6d1bf898f3a14bc4c54e0b4 Author: Tom Stellard Date: Mon Mar 28 18:13:48 2016 +0000 Bump version to 3.8.1
  git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@264605 91177308-0d34-0410-b5e6-96231b3b80d8
commit 2f7e6ec4fa663dff11ba3dff5f74468e79c042d9 Author: TB Schardl Date: Thu Mar 17 08:15:50 2016 +0000 Cleaning up CilkABI.
commit 88a51fc0886146600e14173a0878b6567b29e3bc Author: TB Schardl Date: Thu Mar 17 08:15:05 2016 +0000 Fixing Loop2Cilk CMakeLists entries to fix cmake build.
commit 0d0d243f395a4192bf4d85817c8ac14f5d9d8b2f Author: TB Schardl Date: Thu Mar 17 08:14:16 2016 +0000 Fixing Loop2Cilk for merge with 'release_38'
commit 277ca2c63350507bf3ba5cd075f204e4b356fc5f Merge: 008aa9d2441 ad5750369cc Author: TB Schardl Date: Thu Mar 17 08:09:16 2016 +0000 Merge branch 'release_38' of http://llvm.org/git/llvm into tb-scratch
commit 008aa9d24417420734027b5072ea48cc86b428d2 Author: William S. Moses Date: Sat Mar 12 17:32:11 2016 -0500 loop2cilk working happily
commit ea5e316db15804df27dcfaf6b790f07c8e7bd2b2 Merge: 9b3fc2538fd 1526147c0ad Author: William S. Moses Date: Thu Mar 10 13:16:18 2016 -0500 Merge branch 'tb-scratch' of ssh://github.com/taekwonbilly/Parallel-IR into tb-scratch
commit 9b3fc2538fdd9218bcb1a91b954028652579c6e4 Author: William S. Moses Date: Thu Mar 10 13:15:45 2016 -0500 loop2cilk mods
commit ad5750369cc5b19f36c149f7b13151c99c7be47a Author: Hans Wennborg Date: Wed Mar 2 23:38:03 2016 +0000 ReleaseNotes: tidy up
  git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@262542 91177308-0d34-0410-b5e6-96231b3b80d8
commit 0805780408c97128dc9164d4dbb8604882f5588e Author: Hans Wennborg Date: Wed Mar 2 23:10:55 2016 +0000 Remove 'if you are using a released version' warning
  git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@262537 91177308-0d34-0410-b5e6-96231b3b80d8
commit f26161e8b05360841a1a3a4a2204ed761d6a2e04 Author: Hans Wennborg Date: Wed Mar 2 18:19:22 2016 +0000 ReleaseNotes: C API policy; by Eric Christopher
  git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@262496 91177308-0d34-0410-b5e6-96231b3b80d8
commit 27c964e2ae0b573cf1e6551a3da255539db03d3c Author: Hans Wennborg Date: Fri Feb 26 21:37:52 2016 +0000 ReleaseNotes: PowerPC; by Kit Barton
  git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@262074 91177308-0d34-0410-b5e6-96231b3b80d8
commit bb6f14e3581c78509405a3d415e72821db8a2066 Author: Quentin Colombet Date: Mon Feb 22 22:27:47 2016 +0000 [AArch64] Fix bug in prolog clobbering live reg when shrink wrapping. This adapts r261349 to the release branch.
  git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261586 91177308-0d34-0410-b5e6-96231b3b80d8
commit e970b795a27d16c720bf4e3ff030eea241784eb4 Author: Hans Wennborg Date: Mon Feb 22 21:05:14 2016 +0000 Merging r261441, r261447, and r261546:
  r261441 | nemanjai | 2016-02-20 10:16:25 -0800 (Sat, 20 Feb 2016) | 12 lines
  Fix for PR 26500. This patch corresponds to review: http://reviews.llvm.org/D17294 It ensures that whatever block we are emitting the prologue/epilogue into, we have the necessary scratch registers. It takes away the hard-coded register numbers for use as scratch registers, as registers that are guaranteed to be available in the function prologue/epilogue are not guaranteed to be available within the function body. Since we shrink-wrap, the prologue/epilogue may end up in the function body.
  r261447 | nemanjai | 2016-02-20 12:45:37 -0800 (Sat, 20 Feb 2016) | 6 lines
  Fix the build bot break caused by rL261441. The patch has a necessary call to a function inside an assert, which is fine when you have asserts turned on. Not so much when they're off. Sorry about the regression.
  r261546 | nemanjai | 2016-02-22 10:04:00 -0800 (Mon, 22 Feb 2016) | 6 lines
  Fix for PR26690 take 2. This is what was meant to be in the initial commit to fix this bug. The parens were missing. This commit also adds a test case for the bug and has undergone full testing on PPC and X86.
  git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261572 91177308-0d34-0410-b5e6-96231b3b80d8
commit f65e46be097186d748836d42c38a6dc7f30e6c3b Author: Hans Wennborg Date: Mon Feb 22 17:51:28 2016 +0000 Merging r261387:
  r261387 | davide | 2016-02-19 16:44:47 -0800 (Fri, 19 Feb 2016) | 8 lines
  [X86ISelLowering] Fix TLSADDR lowering when shrink-wrapping is enabled. TLSADDR nodes are lowered into actual calls inside MC. In order to prevent shrink-wrapping from pushing prologue/epilogue past them (which would result in TLS variables being accessed before the stack frame is set up), we put markers, so that the stack gets adjusted properly. Thanks to Quentin Colombet for guidance/help on how to fix this problem!
  git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261542 91177308-0d34-0410-b5e6-96231b3b80d8
commit e3b2bd1e79c9c9d24490b6ddb2341afcf4210691 Author: Hans Wennborg Date: Mon Feb 22 17:47:10 2016 +0000 Merging r261384:
  r261384 | qcolombet | 2016-02-19 16:32:29 -0800 (Fri, 19 Feb 2016) | 4 lines
  [RegAllocFast] Properly track the physical register definitions on calls. PR26485
  git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261539 91177308-0d34-0410-b5e6-96231b3b80d8
commit c63a0fe41b81bac1ea6e1a053d2a8939e02edf17 Author: Hans Wennborg Date: Fri Feb 19 21:42:57 2016 +0000 Merging r261368:
  r261368 | hans | 2016-02-19 13:40:12 -0800 (Fri, 19 Feb 2016) | 3 lines
  Revert r255691 "[LoopVectorizer] Refine loop vectorizer's register usage calculator by ignoring specific instructions." It caused PR26509.
  git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261369 91177308-0d34-0410-b5e6-96231b3b80d8
commit 78e9cd40a2ea27cc9300d900a7dccc75940f9eb0 Author: Hans Wennborg Date: Fri Feb 19 21:35:00 2016 +0000 Merging r261360:
  r261360 | dim | 2016-02-19 12:14:11 -0800 (Fri, 19 Feb 2016) | 19 lines
  Fix incorrect selection of AVX512 sqrt when OptForSize is on
  Summary: When optimizing for size, sqrt calls can be incorrectly selected as AVX512 VSQRT instructions. This is because X86InstrAVX512.td has a `Requires<[OptForSize]>` in its `avx512_sqrt_scalar` multiclass definition. Even if the target does not support AVX512, the class can apparently still be chosen, leading to an incorrect selection of `vsqrtss`. In PR26625, this led to an assertion: Reg >= X86::FP0 && Reg <= X86::FP6 && "Expected FP register!", because the `vsqrtss` instruction requires an XMM register, which is not available on i686 CPUs.
  Reviewers: grosbach, resistor, joker.eph
  Subscribers: spatel, emaste, llvm-commits
  Differential Revision: http://reviews.llvm.org/D17414
  git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261367 91177308-0d34-0410-b5e6-96231b3b80d8
commit fdf40bea4fc416643210790fff4345be98d97245 Author: Hans Wennborg Date: Fri Feb 19 21:28:08 2016 +0000 Merging r261365:
  r261365 | hans | 2016-02-19 13:26:31 -0800 (Fri, 19 Feb 2016) | 3 lines
  Revert r253557 "Alternative to long nops for X86 CPUs, by Andrey Turetsky". Turns out the new nop sequences aren't actually nops on x86_64 (PR26554).
  git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261366 91177308-0d34-0410-b5e6-96231b3b80d8
commit 413ee9f101de92d75fc11334ffeb6a054d67a18c Author: Renato Golin Date: Fri Feb 19 17:35:27 2016 +0000 Merge r261331: avoid out of bounds loads for interleaved access vectorization
  git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261341 91177308-0d34-0410-b5e6-96231b3b80d8
commit 124d2bc4dc3298d2b669be23a5b640d985319b65 Author: Hans Wennborg Date: Fri Feb 19 17:13:16 2016 +0000 Merging r261306:
  r261306 | matze | 2016-02-18 20:44:19 -0800 (Thu, 18 Feb 2016) | 1 line
  LegalizeDAG: Fix ExpandFCOPYSIGN assuming the same type on both inputs
  git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261334 91177308-0d34-0410-b5e6-96231b3b80d8
commit 6f28d52e9d3f87875732a0f2c1f3b03ef56be2db Author: Hans Wennborg Date: Fri Feb 19 00:08:56 2016 +0000 Merging r261258:
  r261258 | rnk | 2016-02-18 12:57:41 -0800 (Thu, 18 Feb 2016) | 14 lines
  [IR] Straighten out bundle overload of IRBuilder::CreateCall. IRBuilder has two ways of putting bundle operands on calls: the default operand bundle, and an overload of CreateCall that takes an operand bundle list. Previously, this overload used a default argument of None. This made it impossible to distinguish between the case where the caller doesn't care about bundles, and the case where the caller explicitly wants no bundles. We behaved as if they wanted the latter behavior rather than the former, which led to problems with simplifylibcalls and WinEH. This change fixes it by making the parameter non-optional, so we can distinguish these two cases.
  git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261282 91177308-0d34-0410-b5e6-96231b3b80d8
commit 6e961aa243f223ddb704ce708056238d7c1d7e24 Author: Hans Wennborg Date: Wed Feb 17 19:00:40 2016 +0000 Merging r261039:
  r261039 | rnk | 2016-02-16 16:17:33 -0800 (Tue, 16 Feb 2016) | 6 lines
  [X86] Fix a shrink-wrapping miscompile around __chkstk. __chkstk clobbers EAX. If EAX is live across the prologue, then we have to take extra steps to save it. We already had code to do this if EAX was a register parameter. This change adapts it to work when shrink wrapping is used.
  git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261136 91177308-0d34-0410-b5e6-96231b3b80d8
commit ebe537a930b58a5d32fc41ac133309139c92f7bd Author: David Majnemer Date: Wed Feb 17 18:49:28 2016 +0000 Merging r258616:
  r258616 | majnemer | 2016-01-22 22:00:44 -0800 (Fri, 22 Jan 2016) | 3 lines
  [PruneEH] Don't try to insert a terminator after another terminator. LLVM's BasicBlock has a single terminator; it is not valid to have two.
  git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261132 91177308-0d34-0410-b5e6-96231b3b80d8
commit 9f25a0678ed9f06088a09649a040a6bef362e6af Author: David Majnemer Date: Wed Feb 17 18:49:09 2016 +0000 Merging r258611:
  r258611 | majnemer | 2016-01-22 21:41:29 -0800 (Fri, 22 Jan 2016) | 6 lines
  [PruneEH] FuncletPads must not have undef operands. Instead of RAUW with undef, replace the first non-token instruction with unreachable. This fixes PR26263.
  git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261131 91177308-0d34-0410-b5e6-96231b3b80d8
commit 4212ebff28e32dbd26bd93f4fa77190d80357ed4 Author: David Majnemer Date: Wed Feb 17 18:48:45 2016 +0000 Merging r258610:
  r258610 | majnemer | 2016-01-22 21:41:27 -0800 (Fri, 22 Jan 2016) | 3 lines
  [PruneEH] Unify invoke and call handling in DeleteBasicBlock. No functionality change is intended.
  git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261130 91177308-0d34-0410-b5e6-96231b3b80d8
commit ba95fe05372c1934c30e21747480d401c1e5bcec Author: David Majnemer Date: Wed Feb 17 18:48:28 2016 +0000 Merging r258609:
  r258609 | majnemer | 2016-01-22 21:41:22 -0800 (Fri, 22 Jan 2016) | 5 lines
  [PruneEH] Reuse code from removeUnwindEdge. PruneEH had functionality identical to removeUnwindEdge. Consolidate around removeUnwindEdge. No functionality change is intended.
  git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261129 91177308-0d34-0410-b5e6-96231b3b80d8
commit 77c8a562e0c7c47df3bb988e2d230df6a9dcbe1d Author: David Majnemer Date: Wed Feb 17 18:42:17 2016 +0000 Merging r259702:
  r259702 | majnemer | 2016-02-03 13:30:34 -0800 (Wed, 03 Feb 2016) | 7 lines
  [LoopStrengthReduce] Don't rewrite PHIs with incoming values from CatchSwitches. Bail out if we have a PHI on an EHPad that gets a value from a CatchSwitchInst. Because the CatchSwitchInst cannot be split, there is no good place to stick any instructions. This fixes PR26373.
  git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261126 91177308-0d34-0410-b5e6-96231b3b80d8
commit c75c50f45b3d6d1d61ce6b411d12cedaadd71d5b Author: David Majnemer Date: Wed Feb 17 18:41:44 2016 +0000 Merging r260164:
  r260164 | akaylor | 2016-02-08 14:52:51 -0800 (Mon, 08 Feb 2016) | 5 lines
  [regalloc][WinEH] Do not mark intervals as not spillable if they contain a regmask. Differential Revision: http://reviews.llvm.org/D16831
  git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261125 91177308-0d34-0410-b5e6-96231b3b80d8
commit fde3338c42eb085f169ecc3817c4736075e4a683 Author: David Majnemer Date: Wed Feb 17 18:41:08 2016 +0000 Merging r260733:
  r260733 | akaylor | 2016-02-12 13:10:16 -0800 (Fri, 12 Feb 2016) | 5 lines
  [WinEH] Prevent EH state numbering from skipping nested cleanup pads that never return. Differential Revision: http://reviews.llvm.org/D17208
  git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261124 91177308-0d34-0410-b5e6-96231b3b80d8
commit 2507c58ca21ee01c359cd5ddf2fe84eea16366ee Author: Hans Wennborg Date: Wed Feb 17 17:57:26 2016 +0000 ReleaseNotes: new Win EH instructions; by David Majnemer
  git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261116 91177308-0d34-0410-b5e6-96231b3b80d8
commit d77e9352a80c954cf91335c236224e4ca7d9c5f4 Author: Hans Wennborg Date: Wed Feb 17 16:40:51 2016 +0000 Merging r261033:
  r261033 | akaylor | 2016-02-16 15:52:18 -0800 (Tue, 16 Feb 2016) | 5 lines
  Fix build LLVM with -D LLVM_USE_INTEL_JITEVENTS:BOOL=ON on Windows. Differential Revision: http://reviews.llvm.org/D16940
  git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261106 91177308-0d34-0410-b5e6-96231b3b80d8
commit 7609bf251117db67abfe0d5b6622860afc769278 Author: Hans Wennborg Date: Wed Feb 17 00:05:18 2016 +0000 ReleaseNotes: -femulated-tls; by Chih-hung Hsieh
  git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261035 91177308-0d34-0410-b5e6-96231b3b80d8
commit 07fd930a2be55b0789737cd9769f0d0e42def3a7 Author: Hans Wennborg Date: Tue Feb 16 23:22:17 2016 +0000 Merging r260390:
  r260390 | jyknight | 2016-02-10 09:47:20 -0800 (Wed, 10 Feb 2016) | 12 lines
  [SPARC] Repair floating-point condition encodings in assembly parser. The encodings for floating point conditions A(lways) and N(ever) were incorrectly specified for the assembly parser, per Sparc manual v8 page 121. This change corrects that mistake. Also, strangely, all of the branch instructions already had MC test cases, except for the broken ones. Added the tests.
  Patch by Chris Dewhurst
  Differential Revision: http://reviews.llvm.org/D17074
  git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261031 91177308-0d34-0410-b5e6-96231b3b80d8
commit b7b1a389f7d546dbe6a67aa3bb0e66f689e99c1b Author: Hans Wennborg Date: Tue Feb 16 21:46:52 2016 +0000 Merging r258103:
  r258103 | kli | 2016-01-18 16:04:41 -0800 (Mon, 18 Jan 2016) | 2 lines
  parseArch() supports more variations of arch names for PowerPC builds
  git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261015 91177308-0d34-0410-b5e6-96231b3b80d8
commit fff361d60b64ac8ee9fcb523872aa7beea8ab8e1 Author: Hans Wennborg Date: Tue Feb 16 19:37:14 2016 +0000 ReleaseNotes: shrink-wrapping; by Quentin Colombet
  git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261001 91177308-0d34-0410-b5e6-96231b3b80d8
commit b129a10bb92529289bbb26d2335b12858e54a885 Author: Hans Wennborg Date: Tue Feb 16 19:29:54 2016 +0000 ReleaseNotes: typo
  git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261000 91177308-0d34-0410-b5e6-96231b3b80d8
commit d3b1222c56e9214e49a3d829e8e60910f8c88903 Author: Hans Wennborg Date: Tue Feb 16 19:27:50 2016 +0000 ReleaseNotes: Hexagon; by Krzysztof Parzyszek
  git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260999 91177308-0d34-0410-b5e6-96231b3b80d8
commit f1aaed61455e48b6c7444f706a6f997a864a42fa Author: Hans Wennborg Date: Tue Feb 16 19:20:40 2016 +0000 Merging r257864 and r258112:
  r257864 | axw | 2016-01-14 19:33:35 -0800 (Thu, 14 Jan 2016) | 12 lines
  [docs] Document LLVM_{BUILD,LINK}_LLVM_DYLIB
  Summary: Document the LLVM_BUILD_LLVM_DYLIB and LLVM_LINK_LLVM_DYLIB CMake options, move BUILD_SHARED_LIBS out of frequently-used, and add a note/warning to BUILD_SHARED_LIBS.
  Reviewers: beanz, delcypher, mjacob
  Subscribers: mjacob, llvm-commits
  Differential Revision: http://reviews.llvm.org/D16208
  r258112 | axw | 2016-01-18 21:43:21 -0800 (Mon, 18 Jan 2016) | 8 lines
  docs: address post-commit review. Rewording/expansion of CMake options suggested by Dan Liew. See http://reviews.llvm.org/D16208.
  git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260996 91177308-0d34-0410-b5e6-96231b3b80d8
commit 80cc2ce6475352a29e19824443c2e0a31a37b44d Author: Hans Wennborg Date: Tue Feb 16 19:19:03 2016 +0000 ReleaseNotes: -DLLVM_LINK_LLVM_DYLIB=ON; by Andrew Wilkins
  git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260995 91177308-0d34-0410-b5e6-96231b3b80d8
commit 1e466cf4f8098acc7025f8d71dd0f64c4754ed63 Author: Hans Wennborg Date: Tue Feb 16 19:07:38 2016 +0000 ReleaseNotes: ORC in Kaleidoscope and C bindings; by Lang Hames
  git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260991 91177308-0d34-0410-b5e6-96231b3b80d8
commit b508a338d9d922a1ec3fbef698bd9fc6b5217ae0 Author: Hans Wennborg Date: Tue Feb 16 17:38:25 2016 +0000 ReleaseNotes: fix typo, reported by Eugene
  git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260985 91177308-0d34-0410-b5e6-96231b3b80d8
commit 4f229233ffc588a35e3738d3c358f2cf7a5da1d1 Author: Hans Wennborg Date: Fri Feb 12 19:03:12 2016 +0000 Merging r260703:
  r260703 | hans | 2016-02-12 11:02:39 -0800 (Fri, 12 Feb 2016) | 11 lines
  [CMake] don't build libLTO when LLVM_ENABLE_PIC is OFF. When cmake is run with -DLLVM_ENABLE_PIC=OFF, build fails while linking shared library libLTO.so,
because its dependencies are built with -fno-PIC. More details here: https://llvm.org/bugs/show_bug.cgi?id=26484. This diff reverts r252652 (git 9fd4377ddb83aee3c049dc8757e7771edbb8ee71), which removed check NOT LLVM_ENABLE_PIC before disabling build for libLTO.so. Patch by Igor Sugak! Differential Revision: http://reviews.llvm.org/D17049 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260704 91177308-0d34-0410-b5e6-96231b3b80d8 commit 7e2ddb94a31d1d085b0228e374799566faa82b8e Author: Peter Collingbourne Date: Fri Feb 12 18:46:48 2016 +0000 ARM: Mention r251322 in release notes. git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260702 91177308-0d34-0410-b5e6-96231b3b80d8 commit 347f4e82e80af64eca192381112ff6e9e3c7c8c3 Author: Hans Wennborg Date: Fri Feb 12 17:52:29 2016 +0000 Merging r260641: ------------------------------------------------------------------------ r260641 | axw | 2016-02-11 17:42:43 -0800 (Thu, 11 Feb 2016) | 10 lines Avoid linking LLVM component libraries with libLLVM Patch by Jack Howarth. When linking to libLLVM, don't also link to the component libraries that constitute libLLVM. Differential Revision: http://reviews.llvm.org/D16945 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260693 91177308-0d34-0410-b5e6-96231b3b80d8 commit e469b8a4f8daa8d29fe1d1f8ed87b36114dd5726 Author: Hans Wennborg Date: Fri Feb 12 16:18:07 2016 +0000 Merging r260427: ------------------------------------------------------------------------ r260427 | nha | 2016-02-10 12:13:58 -0800 (Wed, 10 Feb 2016) | 16 lines AMDGPU: Release the scavenged offset register during VGPR spill Summary: This fixes a crash where subsequent spills would be unable to scavenge a register. 
In particular, it fixes a crash in piglit's spec@glsl-1.50@execution@geometry@max-input-components (the test still has a shader that fails to compile because of too many SGPR spills, but at least it doesn't crash any more). This is a candidate for the release branch. Reviewers: arsenm, tstellarAMD Subscribers: qcolombet, arsenm Differential Revision: http://reviews.llvm.org/D16558 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260687 91177308-0d34-0410-b5e6-96231b3b80d8 commit ec95d6fe25dcb8b1450c4440da7c7a7e2982b6f2 Author: Renato Golin Date: Fri Feb 12 15:29:34 2016 +0000 [ARM/AArch64] 3.8.0 release notes changes git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260684 91177308-0d34-0410-b5e6-96231b3b80d8 commit 10a5589d08c1de3fcd715ce23697d4e591519595 Author: Dylan McKay Date: Fri Feb 12 06:38:02 2016 +0000 [AVR] Add release notes for 3.8 git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260659 91177308-0d34-0410-b5e6-96231b3b80d8 commit 12009f63c5d16b98334930a2b97d279c6bf82ea0 Author: Hans Wennborg Date: Fri Feb 12 02:32:24 2016 +0000 ReleaseNotes: oh, there already was a section about X86 git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260650 91177308-0d34-0410-b5e6-96231b3b80d8 commit fb52ed812c40eb8c6f1f69575bb231b62b319a95 Author: Hans Wennborg Date: Fri Feb 12 02:29:33 2016 +0000 ReleaseNotes: start off a 'Changes to X86' section git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260648 91177308-0d34-0410-b5e6-96231b3b80d8 commit e293d6c8d134ad352bb69defee17c5c902476933 Author: Hans Wennborg Date: Fri Feb 12 01:56:35 2016 +0000 Release Notes: RegisterScheduler::setDefault removed; by Mehdi Amini git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260643 91177308-0d34-0410-b5e6-96231b3b80d8 commit 7a0ec464f16e761602ac9c4e1f610029c0346745 Author: Hans Wennborg 
Date: Fri Feb 12 01:42:38 2016 +0000 Merging r260587: ------------------------------------------------------------------------ r260587 | pete | 2016-02-11 13:10:40 -0800 (Thu, 11 Feb 2016) | 13 lines Set load alignment on aggregate loads. When optimizing an extractvalue(load), we generate a load from the aggregate type. This load didn't have alignment set and so would get the alignment of the type. This breaks when the type is packed and so the alignment should be lower. For example, loading { int, int } would give us alignment of 4, but the original load from this type may have an alignment of 1 if packed. Reviewed by David Majnemer Differential revision: http://reviews.llvm.org/D17158 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260640 91177308-0d34-0410-b5e6-96231b3b80d8 commit 73a8ae3c0f127d45e391bd8b40be51c2fbc15dd8 Author: Hans Wennborg Date: Fri Feb 12 00:45:55 2016 +0000 ReleaseNotes: drop in-progress warning and svn checkout note git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260634 91177308-0d34-0410-b5e6-96231b3b80d8 commit 2ec5a319cacb9e13bf20bc8b9113d11212f10aae Author: Kai Nacke Date: Thu Feb 11 20:42:16 2016 +0000 Add LDC compiler to list of external OS projects using LLVM 3.8 git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260584 91177308-0d34-0410-b5e6-96231b3b80d8 commit 6ca6b8a0c8560555aed16b880f1499a5a0b4deda Author: Duncan P. N.
Exon Smith Date: Wed Feb 10 19:20:23 2016 +0000 ReleaseNotes: Document changes to ilist API git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260415 91177308-0d34-0410-b5e6-96231b3b80d8 commit 185bb1287f864701d9b19eef89e7838162e7c793 Author: Hans Wennborg Date: Mon Feb 8 22:15:55 2016 +0000 Merging r259958: ------------------------------------------------------------------------ r259958 | evandro | 2016-02-05 16:01:41 -0800 (Fri, 05 Feb 2016) | 11 lines [AArch64] Add the scheduling model for Exynos-M1 Summary: Add the core scheduling model for the Samsung Exynos-M1 (ARMv8-A). Reviewers: jmolloy, rengolin, christof, MinSeongKIM, t.p.northover Subscribers: aemerson, rengolin, MatzeB Differential Revision: http://reviews.llvm.org/D16644 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260156 91177308-0d34-0410-b5e6-96231b3b80d8 commit 777479f80202057f041683129d4fd9e574ffea79 Author: Hans Wennborg Date: Mon Feb 8 18:31:49 2016 +0000 Merging r259696: ------------------------------------------------------------------------ r259696 | kfischer | 2016-02-03 13:13:33 -0800 (Wed, 03 Feb 2016) | 12 lines [DWARFDebug] Fix another case of overlapping ranges Summary: In r257979, I added code to ensure that we wouldn't merge DebugLocEntries if the pieces they describe overlap. Unfortunately, I failed to cover the case where there may be multiple active Expressions in the entry, in which case we need to make sure that no two values overlap before we can perform the merge. This fixes PR26148.
Reviewers: aprantl Differential Revision: http://reviews.llvm.org/D16742 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260121 91177308-0d34-0410-b5e6-96231b3b80d8 commit 7ecd92d75cda45668b6b5fdbcdd2142826514e66 Author: Daniel Sanders Date: Mon Feb 8 14:14:18 2016 +0000 [mips] Add initial release notes for MIPS32. git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260095 91177308-0d34-0410-b5e6-96231b3b80d8 commit ff65de018b6bb5bc4da3e923bbc0f55c5ca8e039 Author: Hans Wennborg Date: Fri Feb 5 22:17:38 2016 +0000 Merging r259381: ------------------------------------------------------------------------ r259381 | uweigand | 2016-02-01 10:31:19 -0800 (Mon, 01 Feb 2016) | 21 lines [SystemZ] Fix wrong-code generation for certain always-false conditions We've found another bug in the code generation logic for a certain class of always-false conditions, those of the form if ((a & 1) < 0) These only reach the back end when compiling without optimization. The bug was introduced by the choice of using TEST UNDER MASK to implement a check for if ((a & MASK) < VAL) as if ((a & MASK) == 0) where VAL is less than the lowest bit of MASK. This is correct in all cases except for VAL == 0, in which case the original condition is always false, but the replacement isn't. Fixed by excluding that particular case.
------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@259940 91177308-0d34-0410-b5e6-96231b3b80d8 commit 56d368f5a52e60fa29891a6647034fffbba8713b Author: Hans Wennborg Date: Fri Feb 5 16:30:31 2016 +0000 Merging r259886 and r259888: ------------------------------------------------------------------------ r259886 | nemanjai | 2016-02-05 06:50:29 -0800 (Fri, 05 Feb 2016) | 5 lines Fix for PR 26193 This is a simple fix for a PowerPC intrinsic that was incorrectly defined (the return type was incorrect). ------------------------------------------------------------------------ ------------------------------------------------------------------------ r259888 | nemanjai | 2016-02-05 07:03:17 -0800 (Fri, 05 Feb 2016) | 3 lines Add the missing test case for PR26193 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@259891 91177308-0d34-0410-b5e6-96231b3b80d8 commit 9be4dc8ab20a009ed5f24610888421ba84f8ec65 Author: Hans Wennborg Date: Fri Feb 5 00:55:39 2016 +0000 Merging r259840 on top of r259178: ------------------------------------------------------------------------ r259178 | echristo | 2016-01-28 23:20:30 -0800 (Thu, 28 Jan 2016) | 1 line Refactor common code for PPC fast isel load immediate selection. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r259840 | nemanjai | 2016-02-04 15:14:42 -0800 (Thu, 04 Feb 2016) | 7 lines Fix for PR 26356 Using the load immediate only when the immediate (whether signed or unsigned) can fit in a 16-bit signed field. Namely, from -32768 to 32767 for signed and 0 to 65535 for unsigned. This patch also ensures that we sign-extend under the right conditions. 
------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@259858 91177308-0d34-0410-b5e6-96231b3b80d8 commit 12d60e9e7c149a7d333e277dfbe25a720c88c585 Author: Hans Wennborg Date: Fri Feb 5 00:46:12 2016 +0000 Merging r259798, r259835: ------------------------------------------------------------------------ r259798 | nemanjai | 2016-02-04 08:18:08 -0800 (Thu, 04 Feb 2016) | 9 lines Enable the %s modifier in inline asm template string This patch corresponds to review: http://reviews.llvm.org/D16847 There are some files in glibc that use the output operand modifier even though it was deprecated in GCC. This patch just adds support for it to prevent issues with such files. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r259835 | nemanjai | 2016-02-04 14:36:10 -0800 (Thu, 04 Feb 2016) | 3 lines Provide a test case for r259798 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@259856 91177308-0d34-0410-b5e6-96231b3b80d8 commit 78a7d49140626994c23367b709e7b30b41e5cf70 Author: Hans Wennborg Date: Thu Feb 4 16:59:45 2016 +0000 Merging r259695: ------------------------------------------------------------------------ r259695 | tfiala | 2016-02-03 13:13:23 -0800 (Wed, 03 Feb 2016) | 11 lines Address NDEBUG-related linkage issues for Value::assertModuleIsMaterialized() The IR/Value class had a linkage issue present when LLVM was built as a library, and the LLVM library build time had different settings for NDEBUG than the client of the LLVM library. Clients could get into a state where the LLVM lib expected Value::assertModuleIsMaterialized() to be inline-defined in the header but clients expected that method to be defined in the LLVM library.
See this llvm-commits thread for more details: http://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20160201/329667.html ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@259801 91177308-0d34-0410-b5e6-96231b3b80d8 commit 19b86f670bb5005761ecdcbe41423fee7fd200cf Author: Hans Wennborg Date: Thu Feb 4 02:16:36 2016 +0000 Merging r259740: ------------------------------------------------------------------------ r259740 | nemanjai | 2016-02-03 17:58:20 -0800 (Wed, 03 Feb 2016) | 2 lines Test case for PR 26381 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@259743 91177308-0d34-0410-b5e6-96231b3b80d8 commit 0a7ec6ced609c340fc4028aa8a65996623dd4181 Author: Hans Wennborg Date: Wed Feb 3 22:00:13 2016 +0000 Merging r259177: ------------------------------------------------------------------------ r259177 | echristo | 2016-01-28 23:20:01 -0800 (Thu, 28 Jan 2016) | 5 lines Since LI/LIS sign extend the constant passed into the instruction we should check that the sign extended constant fits into 16-bits if we want a zero extended value, otherwise go ahead and put it together piecemeal. Fixes PR26356. ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@259713 91177308-0d34-0410-b5e6-96231b3b80d8 commit 6b78a48f5c068df653f1c12d2ad7832aaa45c7a1 Author: Hans Wennborg Date: Wed Feb 3 21:24:31 2016 +0000 Merging r259649: ------------------------------------------------------------------------ r259649 | jamesm | 2016-02-03 07:05:06 -0800 (Wed, 03 Feb 2016) | 11 lines [DemandedBits] Revert r249687 due to PR26071 This regresses a test in LoopVectorize, so I'll need to go away and think about how to solve this in a way that isn't broken. 
From the writeup in PR26071: What's happening is that ComputeKnownZeroes is telling us that all bits except the LSB are zero. We're then deciding that only the LSB needs to be demanded from the icmp's inputs. This is where we're wrong - we're assuming that after simplification the bits that were known zero will continue to be known zero. But they're not - during trivialization the upper bits get changed (because an XOR isn't shrunk), so the icmp fails. The fault is in demandedbits - its contract does clearly state that a non-demanded bit may either be zero or one. ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@259699 91177308-0d34-0410-b5e6-96231b3b80d8 commit 18a86c95fc36b5f622e8dc87f71252de37a1ed44 Author: Hans Wennborg Date: Wed Feb 3 21:18:35 2016 +0000 Merging r259645: ------------------------------------------------------------------------ r259645 | nemanjai | 2016-02-03 04:53:38 -0800 (Wed, 03 Feb 2016) | 4 lines Fix for PR 26381 Simple fix - Constant values were not being sign extended in FastIsel. ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@259698 91177308-0d34-0410-b5e6-96231b3b80d8 commit 1bfe978e5d0ac77f381b0ccef78204f7f3593a01 Author: Hans Wennborg Date: Tue Feb 2 17:41:39 2016 +0000 Merging r259346 (with adjustments for r258867): ------------------------------------------------------------------------ r259346 | ibreger | 2016-02-01 01:57:15 -0800 (Mon, 01 Feb 2016) | 3 lines AVX512: fix mask handling for gather/scatter/prefetch intrinsics. 
Differential Revision: http://reviews.llvm.org/D16755 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@259536 91177308-0d34-0410-b5e6-96231b3b80d8 commit f24a5b58cd7ecc4fada221308073b9f13672d6c0 Author: Hans Wennborg Date: Tue Feb 2 17:35:07 2016 +0000 Merging r259342 (with s/p2align 4/align 16) because r258750 is not in 3.8. ------------------------------------------------------------------------ r259342 | ibreger | 2016-01-31 23:56:09 -0800 (Sun, 31 Jan 2016) | 3 lines AVX512 : Fix SETCCE lowering for KNL 32 bit. Differential Revision: http://reviews.llvm.org/D16752 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@259533 91177308-0d34-0410-b5e6-96231b3b80d8 commit 5ea3635939d3e30182cd5a9881447890c8b69c42 Author: Hans Wennborg Date: Mon Feb 1 19:18:10 2016 +0000 Merging r259375: ------------------------------------------------------------------------ r259375 | majnemer | 2016-02-01 09:37:56 -0800 (Mon, 01 Feb 2016) | 6 lines [InstCombine] Don't transform (X+INT_MAX)>=(Y+INT_MAX) -> (X<=Y) This miscompile came about because we tried to use a transform which was only appropriate for xor operators when addition was present. This fixes PR26407. 
------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@259390 91177308-0d34-0410-b5e6-96231b3b80d8 commit aad888f28ee3e920b6e1a3828398f6c9c256f3d3 Author: Tim Northover Date: Fri Jan 29 22:00:06 2016 +0000 Merging r259228: ------------------------------------------------------------------------ r259228 | tnorthover | 2016-01-29 11:18:46 -0800 (Fri, 29 Jan 2016) | 13 lines ARM: don't mangle DAG constant if it has more than one use The basic optimisation was to convert (mul $LHS, $complex_constant) into roughly "(shl (mul $LHS, $simple_constant), $simple_amt)" when it was expected to be cheaper. The original logic checks that the mul only has one use (since we're mangling $complex_constant), but when used in even more complex addressing modes there may be an outer addition that can pick up the wrong value too. I *think* the ARM addressing-mode problem is actually unreachable at the moment, but that depends on complex assessments of the profitability of pre-increment addressing modes so I've put a real check in there instead of an assertion. ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@259247 91177308-0d34-0410-b5e6-96231b3b80d8 commit 5ad5d2c5359a4e878c732db59ee7fc6e0a25dc00 Author: Hans Wennborg Date: Fri Jan 29 21:33:02 2016 +0000 Merging r259236: ------------------------------------------------------------------------ r259236 | spatel | 2016-01-29 12:21:02 -0800 (Fri, 29 Jan 2016) | 8 lines [InstCombine] avoid an insertelement transformation that induces the opposite extractelement fold (PR26354) We would infinite loop because we created a shufflevector that was wider than needed and then failed to combine that with the insertelement. 
When subsequently visiting the extractelement from that shuffle, we see that it's unnecessary, delete it, and trigger another visit to the insertelement. ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@259245 91177308-0d34-0410-b5e6-96231b3b80d8 commit cd30d75375a03a290c6621da13cbab4f10545c56 Author: Tom Stellard Date: Fri Jan 29 16:45:55 2016 +0000 Merging r258922: ------------------------------------------------------------------------ r258922 | marek.olsak | 2016-01-27 06:19:45 -0500 (Wed, 27 Jan 2016) | 12 lines AMDGPU/SI: Stoney has only 16 LDS banks Summary: This is a candidate for stable, along with all patches that add the "stoney" processor. Reviewers: tstellarAMD Subscribers: arsenm Differential Revision: http://reviews.llvm.org/D16485 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@259207 91177308-0d34-0410-b5e6-96231b3b80d8 commit a8a522e4217a621114bedcb1cedee056c59a6273 Author: Tom Stellard Date: Fri Jan 29 16:45:52 2016 +0000 Merging r257666: ------------------------------------------------------------------------ r257666 | changpeng.fang | 2016-01-13 15:39:25 -0500 (Wed, 13 Jan 2016) | 2 lines AMDGPU/SI: Update ISA version for FIJI ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@259206 91177308-0d34-0410-b5e6-96231b3b80d8 commit c3c52626df3d5b9bd06b160450da8335deb24dc8 Author: Daniel Sanders Date: Thu Jan 28 21:05:40 2016 +0000 Bring back the test-suite export in test-release without bringing back the build failures. Summary: r257791 disabled the test-suite export since the addition of CMakeLists.txt was causing build failures. This patch exports the test-suite again but does so outside the source tree so that it isn't included in the Phase[123] builds. 
Reviewers: hans Subscribers: llvm-commits Differential Revision: http://reviews.llvm.org/D16679 git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@259093 91177308-0d34-0410-b5e6-96231b3b80d8 commit 72901a8afaae6c9f8ea63ba1c9c9d4699c7eec49 Author: Hans Wennborg Date: Thu Jan 28 18:23:25 2016 +0000 Merging r258971: ------------------------------------------------------------------------ r258971 | spatel | 2016-01-27 11:22:45 -0800 (Wed, 27 Jan 2016) | 26 lines [SimplifyCFG] limit recursion depth when speculating instructions (PR26308) This is a fix for: https://llvm.org/bugs/show_bug.cgi?id=26308 With the switch to using the TTI cost model in: http://reviews.llvm.org/rL228826 ...it became possible to hit a zero-cost cycle of instructions (gep -> phi -> gep...), so we need a cap for the recursion in DominatesMergePoint(). A recursion depth parameter was already added for a different reason in: http://reviews.llvm.org/rL255660 ...so we can just set a limit for it. I pulled "10" out of the air and made it an independent parameter that we can play with. It might be higher than it needs to be given the currently low default value of PHINodeFoldingThreshold (2). That's the starting cost value that we enter the recursion with, and most instructions have cost set to TCC_Basic (1), so I don't think we're going to speculate more than 2 instructions with the current parameters. As noted in the review and the TODO comment, we can do better than just limiting recursion depth. 
Differential Revision: http://reviews.llvm.org/D16637 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@259066 91177308-0d34-0410-b5e6-96231b3b80d8 commit 131d76722983cb030c392bcb50bba940e98ea0c6 Author: Hans Wennborg Date: Thu Jan 28 18:16:55 2016 +0000 Merging r258471: ------------------------------------------------------------------------ r258471 | pirama | 2016-01-21 17:16:57 -0800 (Thu, 21 Jan 2016) | 14 lines Do not lower VSETCC if operand is an f16 vector Summary: SETCC with f16 vectors has OperationAction set to Expand but still gets lowered to FCM* intrinsics based on its result type. This patch skips lowering of VSETCC if the operand is an f16 vector. v4 and v8 tests included. Reviewers: ab, jmolloy Subscribers: srhines, llvm-commits Differential Revision: http://reviews.llvm.org/D15361 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@259064 91177308-0d34-0410-b5e6-96231b3b80d8 commit 82cf8c0ebce3d4cac59da2cc36df0c0cd9730d72 Author: Hans Wennborg Date: Wed Jan 27 00:19:52 2016 +0000 Merging r258891: ------------------------------------------------------------------------ r258891 | hans | 2016-01-26 16:19:05 -0800 (Tue, 26 Jan 2016) | 25 lines test-release.sh: Ignore LC_CTYPE in sed invocation on Darwin Here, sed is used to prepare object files for comparison via cmp. On my Darwin 15.4.0 machine, LC_CTYPE is set to UTF-8 (by default, I believe). Under these circumstances, anything sed is made to read will be treated as UTF-8, prompting it to signal an error if it is not, like so: % sed s/a/b/ <(head -n1 /dev/random) >/dev/null; echo $? sed: RE error: illegal byte sequence 1 % To make sed work as expected, I need to set LC_CTYPE to C: % env LC_CTYPE=C sed s/a/b/ <(head -n1 /dev/random) >/dev/null; echo $? 
0 % Without this change, sed will exit with an error for every single file that it compares between phase 2 and phase 3, thereby making it look as if the differences were far larger than they are. Patch by Elias Pipping! Differential Revision: http://reviews.llvm.org/D16548 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258892 91177308-0d34-0410-b5e6-96231b3b80d8 commit 5eefadb302242035deaf04c5585bb4cd46125deb Author: Tom Stellard Date: Tue Jan 26 23:57:01 2016 +0000 Merging r258386: ------------------------------------------------------------------------ r258386 | thomas.stellard | 2016-01-20 23:28:34 -0500 (Wed, 20 Jan 2016) | 14 lines AMDGPU/SI: Pass whether to use the SI scheduler via Target Attribute Summary: Currently the SI scheduler can be selected via command line option, but it turned out it would be better if it was selectable via a Target Attribute. This patch adds "si-scheduler" attribute to the backend. Reviewers: tstellarAMD, echristo Subscribers: echristo, arsenm Differential Revision: http://reviews.llvm.org/D16192 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258885 91177308-0d34-0410-b5e6-96231b3b80d8 commit 44fb5881d8edf448d6231a5b8df583aecd6bcd42 Author: Sanjoy Das Date: Tue Jan 26 22:29:46 2016 +0000 Merging r258184: ------------------------------------------------------------------------ r258184 | sanjoy | 2016-01-19 12:53:51 -0800 (Tue, 19 Jan 2016) | 20 lines [SCEV] Fix PR26207 In some cases, the max backedge taken count can be more conservative than the exact backedge taken count (for instance, because ScalarEvolution::getRange is not control-flow sensitive whereas computeExitLimitFromICmp can be). 
In these cases, computeExitLimitFromCond (specifically the bit that deals with `and` and `or` instructions) can create an ExitLimit instance with a `SCEVCouldNotCompute` max backedge count expression, but a computable exact backedge count expression. This violates an implicit SCEV assumption: a computable exact BE count should imply a computable max BE count. This change - Makes the above implicit invariant explicit by adding an assert to ExitLimit's constructor - Changes `computeExitLimitFromCond` to be more robust around conservative max backedge counts ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258869 91177308-0d34-0410-b5e6-96231b3b80d8 commit 4d1ef71f362e014aaaaefeb36abe83c24b578e40 Author: Hans Wennborg Date: Tue Jan 26 19:44:49 2016 +0000 Revert accidental changes from r258805 git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258844 91177308-0d34-0410-b5e6-96231b3b80d8 commit 9a498947cdb25737faecfdabcb64848432c49d68 Author: Dimitry Andric Date: Tue Jan 26 19:43:59 2016 +0000 Merging r258436: ------------------------------------------------------------------------ r258436 | dim | 2016-01-21 22:57:49 +0100 (Thu, 21 Jan 2016) | 17 lines Let test-release.sh checkout subprojects directly into the target tree, instead of using symlinks Summary: In the past I have run into several problems with the way `test-release.sh` creates all the subproject directories as siblings, and then uses symlinks to stitch them all together. In some scenarios this leads to clang not being able to find header files, etc. This patch changes the script so it directly exports into the correct target locations for each subproject. 
Reviewers: hans Subscribers: emaste, llvm-commits Differential Revision: http://reviews.llvm.org/D16420 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258842 91177308-0d34-0410-b5e6-96231b3b80d8 commit 4b85564ba4a41465155b9128a68e5e14fea78365 Author: Hans Wennborg Date: Tue Jan 26 19:31:16 2016 +0000 Merging r258729: ------------------------------------------------------------------------ r258729 | matze | 2016-01-25 14:08:25 -0800 (Mon, 25 Jan 2016) | 13 lines X86ISelLowering: Fix cmov(cmov) special lowering bug There's a special case in EmitLoweredSelect() that produces an improved lowering for cmov(cmov) patterns. However this special lowering is currently broken if the inner cmov has multiple users so this patch stops using it in this case. If you wonder why this wasn't fixed by continuing to use the special lowering and inserting a 2nd PHI for the inner cmov: I believe this would incur additional copies/register pressure so the special lowering does not improve upon the normal one anymore in this case. This fixes http://llvm.org/PR26256 (= rdar://24329747) ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258840 91177308-0d34-0410-b5e6-96231b3b80d8 commit db6cb1a90cd0ab35e2dadc97962a5d67742c0bbc Author: James Molloy Date: Tue Jan 26 13:30:49 2016 +0000 Merging r258690: ------------------------------------------------------------------------ r258690 | jamesm | 2016-01-25 14:49:36 +0000 (Mon, 25 Jan 2016) | 7 lines [DemandedBits] Fix computation of demanded bits for ICmps The computation of ICmp demanded bits is independent of the individual operand being evaluated. We simply return a mask consisting of the minimum leading zeroes of both operands. We were incorrectly passing "I" to ComputeKnownBits - this should be "UserI->getOperand(0)". 
In cases where we were evaluating operand 1, we were taking the minimum leading zeroes of it and itself. This should fix PR26266. ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258805 91177308-0d34-0410-b5e6-96231b3b80d8 commit 836d2ad83c5e955a23f6e3b78418cb250c95c88b Author: Hans Wennborg Date: Mon Jan 25 22:24:50 2016 +0000 Merging r258406: ------------------------------------------------------------------------ r258406 | vedantk | 2016-01-21 09:04:42 -0800 (Thu, 21 Jan 2016) | 16 lines [GCOV] Avoid emitting profile arcs for module and skeleton CUs Do not emit profile arc files and note files for module and skeleton CUs. Our users report seeing unexpected *.gcda and *.gcno files in their projects when using gcov-style profiling with modules or frameworks. The unwanted files come from these modules. This is not very helpful for end-users. Further, we've seen reports of instrumented programs crashing while writing these files out (due to I/O failures).
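The corrected DemandedBits computation for ICmp — a mask keeping only the low bits up to the smaller leading-zero count of the *two* operands, rather than one operand and itself — can be sketched with plain integers standing in for APInt and known-bits (names here are illustrative, not LLVM's):

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>

// Portable count of leading zero bits in a 32-bit value.
static int leadingZeros32(uint32_t V) {
    int N = 0;
    for (uint32_t Bit = 0x80000000u; Bit && !(V & Bit); Bit >>= 1)
        ++N;
    return N;
}

// Demanded-bits mask for an integer compare: bits above the highest bit
// that could be set in *either* operand cannot influence the result.
// The bug was deriving this from a single operand twice.
static uint32_t icmpDemandedMask(uint32_t MaxOp0, uint32_t MaxOp1) {
    int MinLZ = std::min(leadingZeros32(MaxOp0), leadingZeros32(MaxOp1));
    int Demanded = 32 - MinLZ; // number of low bits that still matter
    if (Demanded == 0)  return 0;
    if (Demanded == 32) return ~0u;
    return (1u << Demanded) - 1;
}
```

Note that the mask is the same no matter which operand is currently being evaluated — which is exactly why computing it from one operand and itself went unnoticed until PR26266.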
rdar://problem/22838296 Reviewed-by: aprantl Differential Revision: http://reviews.llvm.org/D15997 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258731 91177308-0d34-0410-b5e6-96231b3b80d8 commit 16f83af7618a4dfb4ef1891b07adb23cc54e4c86 Author: Hans Wennborg Date: Fri Jan 22 18:37:31 2016 +0000 Merging r258416 and r258428: ------------------------------------------------------------------------ r258416 | spatel | 2016-01-21 10:01:57 -0800 (Thu, 21 Jan 2016) | 2 lines make helper functions static; NFCI ------------------------------------------------------------------------ ------------------------------------------------------------------------ r258428 | spatel | 2016-01-21 12:19:54 -0800 (Thu, 21 Jan 2016) | 15 lines [LibCallSimplifier] don't get fooled by a fake fmin() This is similar to the bug/fix: https://llvm.org/bugs/show_bug.cgi?id=26211 http://reviews.llvm.org/rL258325 The fmin() test case reveals another bug caused by sloppy code duplication. It will crash without this patch because fp128 is a valid floating-point type, but we would think that we had matched a function that used doubles. The new helper function can be used to replace similar checks that are used in several other places in this file. ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258512 91177308-0d34-0410-b5e6-96231b3b80d8 commit c89d9654310e0f5b1171888c6573d09c9e66d0c4 Author: Hans Wennborg Date: Fri Jan 22 18:26:38 2016 +0000 Merging r257886: ------------------------------------------------------------------------ r257886 | jamesm | 2016-01-15 02:36:01 -0800 (Fri, 15 Jan 2016) | 3 lines [CodeGenPrepare] Try and appease sanitizers dupRetToEnableTailCallOpts(BB) can invalidate BB. It must run *after* we iterate across BB! 
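The "fake fmin()" failure mode above — matching a library call by name while forgetting that fp128 is also a valid floating-point type — comes down to validating the whole signature, not just the callee name. A minimal sketch with hypothetical stand-in types (this is not LLVM's FunctionType API):

```cpp
#include <cassert>
#include <string>
#include <vector>

// Hypothetical IR-type stand-ins. fp128 is a perfectly legal FP type,
// which is exactly how the sloppy check got fooled.
enum class Ty { Double, Fp128, Int32 };

struct CallSite {
    std::string Name;
    Ty RetTy;
    std::vector<Ty> ArgTys;
};

// The guard the commit describes: match the full double-based libm
// prototype before treating the call as the math function it is named
// after.
static bool matchesDoubleLibCall(const CallSite &CS, const std::string &Name,
                                 unsigned NumArgs) {
    if (CS.Name != Name || CS.ArgTys.size() != NumArgs)
        return false;
    if (CS.RetTy != Ty::Double)
        return false;
    for (Ty T : CS.ArgTys)
        if (T != Ty::Double)
            return false;
    return true;
}
```

With such a helper, an `fmin` taking fp128 arguments or a `sqrt` returning an integer is rejected up front instead of crashing a later transform.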
------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258510 91177308-0d34-0410-b5e6-96231b3b80d8 commit 961a0e424cc7a63ee57cc8506c8a04cbf6012e1f Author: Hans Wennborg Date: Wed Jan 20 21:49:02 2016 +0000 Merging r258325: ------------------------------------------------------------------------ r258325 | spatel | 2016-01-20 09:41:14 -0800 (Wed, 20 Jan 2016) | 21 lines [LibCallSimplifier] don't get fooled by a fake sqrt() The test case will crash without this patch because the subsequent call to hasUnsafeAlgebra() assumes that the call instruction is an FPMathOperator (ie, returns an FP type). This part of the function signature check was omitted for the sqrt() case, but seems to be in place for all other transforms. Before: http://reviews.llvm.org/rL257400 ...we would have needlessly continued execution in optimizeSqrt(), but the bug was harmless because we'd eventually fail some other check and return without damage. This should fix: https://llvm.org/bugs/show_bug.cgi?id=26211 Differential Revision: http://reviews.llvm.org/D16198 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258353 91177308-0d34-0410-b5e6-96231b3b80d8 commit 3acb8d3c6d4e470172fb244c809bc6fdd7948c29 Author: Hans Wennborg Date: Wed Jan 20 21:30:57 2016 +0000 Merging r257940: ------------------------------------------------------------------------ r257940 | djg | 2016-01-15 13:56:40 -0800 (Fri, 15 Jan 2016) | 10 lines [SelectionDAG] CSE nodes with differing SDNodeFlags In the optimizer (GVN etc.) when eliminating redundant nodes with different flags, the flags are ignored for the purposes of testing for congruence, and then intersected for the purposes of producing a result that supports the union of all the uses. This commit makes SelectionDAG's CSE do the same thing, allowing it to CSE nodes in more cases. 
This fixes PR26063. Differential Revision: http://reviews.llvm.org/D15957 ------------------------------------------------------------------------ Merging r257942: ------------------------------------------------------------------------ r257942 | djg | 2016-01-15 14:07:35 -0800 (Fri, 15 Jan 2016) | 2 lines Remove a now-empty file left behind by r257940. ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258351 91177308-0d34-0410-b5e6-96231b3b80d8 commit 3260476414aa2e03566d205f742220a382f4ce07 Author: Hans Wennborg Date: Wed Jan 20 21:14:05 2016 +0000 Merging r258273: ------------------------------------------------------------------------ r258273 | josepht | 2016-01-19 18:15:15 -0800 (Tue, 19 Jan 2016) | 37 lines [Inliner/WinEH] Honor implicit nounwinds Summary: Funclet EH tables require that a given funclet have only one unwind destination for exceptional exits. The verifier will therefore reject e.g. two cleanuprets with different unwind dests for the same cleanup, or two invokes exiting the same funclet but to different unwind dests. Because catchswitch has no 'nounwind' variant, and because IR producers are not *required* to annotate calls which will not unwind as 'nounwind', it is legal to nest a call or an "unwind to caller" catchswitch within a funclet pad that has an unwind destination other than caller; it is undefined behavior for such a call or catchswitch to unwind. Normally when inlining an invoke, calls in the inlined sequence are rewritten to invokes that unwind to the callsite invoke's unwind destination, and "unwind to caller" catchswitches in the inlined sequence are rewritten to unwind to the callsite invoke's unwind destination. 
However, if such a call or "unwind to caller" catchswitch is located in a callee funclet that has another exceptional exit with an unwind destination within the callee, applying the normal transformation would give that callee funclet multiple unwind destinations for its exceptional exits. There would be no way for EH table generation to determine which is the "true" exit, and the verifier would reject the function accordingly. Add logic to the inliner to detect these cases and leave such calls and "unwind to caller" catchswitches as calls and "unwind to caller" catchswitches in the inlined sequence. This fixes PR26147. Reviewers: rnk, andrew.w.kaylor, majnemer Subscribers: alexcrichton, llvm-commits Differential Revision: http://reviews.llvm.org/D16319 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258349 91177308-0d34-0410-b5e6-96231b3b80d8 commit 38e40410b1fa6441db511e760bc6ae263a8bbaee Author: Renato Golin Date: Wed Jan 20 18:01:05 2016 +0000 Merging r258308: [AArch64] Fix two bugs in the .inst directive git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258326 91177308-0d34-0410-b5e6-96231b3b80d8 commit 870ff87d1cd25f9a2dd01d7c75489a63eca377c2 Author: Quentin Colombet Date: Wed Jan 20 01:14:03 2016 +0000 Merging r258221: ------------------------------------------------------------------------ r258221 | qcolombet | 2016-01-19 15:29:03 -0800 (Tue, 19 Jan 2016) | 8 lines [X86] Do not run shrink-wrapping on function with split-stack attribute or HiPE calling convention. The implementation of the related callbacks in the x86 backend for such functions are not ready to deal with a prologue block that is not the entry block of the function. This fixes PR26107, but the longer term solution would be to fix those callbacks. 
------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258269 91177308-0d34-0410-b5e6-96231b3b80d8 commit 90fef5a5b6514f60396e81d7fa20581d05ca659b Author: Quentin Colombet Date: Wed Jan 20 01:09:12 2016 +0000 Merging r258207: ------------------------------------------------------------------------ r258207 | qcolombet | 2016-01-19 14:31:12 -0800 (Tue, 19 Jan 2016) | 1 line [MachineFunction] Constify getter. NFC. ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258268 91177308-0d34-0410-b5e6-96231b3b80d8 commit 770ec8cf9ae215e26cb6d946b9d533151fe0558d Author: Hans Wennborg Date: Wed Jan 20 00:48:30 2016 +0000 Merging r257977: ------------------------------------------------------------------------ r257977 | kfischer | 2016-01-15 17:11:33 -0800 (Fri, 15 Jan 2016) | 1 line [DwarfDebug] Move MergeValues to .cpp, NFC ------------------------------------------------------------------------ Merging r257979: ------------------------------------------------------------------------ r257979 | kfischer | 2016-01-15 17:15:32 -0800 (Fri, 15 Jan 2016) | 11 lines [DwarfDebug] Don't merge DebugLocEntries if their pieces overlap Summary: Later in DWARF emission we check that DebugLocEntries have non-overlapping pieces, so we should create any such entries by merging here. Fixes PR26163. Reviewers: aprantl Differential Revision: http://reviews.llvm.org/D16249 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258267 91177308-0d34-0410-b5e6-96231b3b80d8 commit d26a2e75e58f56a289b911c0bf582be4f8f655f1 Author: NAKAMURA Takumi Date: Wed Jan 20 00:32:09 2016 +0000 [r257857] lli: use llvm::utostr() instead of std::to_string(). 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258266 91177308-0d34-0410-b5e6-96231b3b80d8 commit 316ab7147bf233fd6a84977173f99b3fc9a26e0e Author: NAKAMURA Takumi Date: Wed Jan 20 00:28:22 2016 +0000 [r257732] Mark remote-JIT tests as XFAIL, as well as win32, for targeting mingw32. git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258265 91177308-0d34-0410-b5e6-96231b3b80d8 commit f5575ecd57c4ab8cdae1a80fecc01029d14fe4e6 Author: Hans Wennborg Date: Tue Jan 19 20:49:25 2016 +0000 Merging r257875: ------------------------------------------------------------------------ r257875 | jamesm | 2016-01-15 01:20:19 -0800 (Fri, 15 Jan 2016) | 11 lines [InstCombine] Rewrite bswap/bitreverse handling completely. There are several requirements that ended up with this design; 1. Matching bitreversals is too heavyweight for InstCombine and doesn't really need to be done so early. 2. Bitreversals and byteswaps are very related in their matching logic. 3. We want to implement support for matching more advanced bswap/bitreverse patterns like partial bswaps/bitreverses. 4. Bswaps are best matched early in InstCombine. The result of these is that a new utility function is created in Transforms/Utils/Local.h that can be configured to search for bswaps, bitreverses or both. InstCombine uses it to find only bswaps, CGP uses it to find only bitreversals. We can then extend the matching logic in one place only. 
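As a concrete illustration of what the unified bswap/bitreverse matcher in Transforms/Utils/Local.h looks for, here is the classic shift-and-mask source pattern (the pattern being matched, not the matcher itself):

```cpp
#include <cassert>
#include <cstdint>

// A 32-bit byte swap spelled out as shifts and masks -- the kind of
// pattern the matching utility is meant to recognize and collapse into
// a single llvm.bswap.i32 intrinsic call.
static uint32_t bswap32Pattern(uint32_t X) {
    return ((X & 0x000000FFu) << 24) |
           ((X & 0x0000FF00u) <<  8) |
           ((X & 0x00FF0000u) >>  8) |
           ((X & 0xFF000000u) >> 24);
}
```

Bitreverse patterns look structurally similar (bit-granular instead of byte-granular masks), which is why the commit argues for one configurable matching routine shared by InstCombine and CodeGenPrepare.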
------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258180 91177308-0d34-0410-b5e6-96231b3b80d8 commit e12bf2aba135af15b33cca8a8c0fb80189a16b80 Author: Hans Wennborg Date: Tue Jan 19 19:28:41 2016 +0000 Merging r258168: ------------------------------------------------------------------------ r258168 | hans | 2016-01-19 11:21:58 -0800 (Tue, 19 Jan 2016) | 3 lines test-release.sh: Use CMake also for Darwin This didn't work for 3.7, but hopefully it should work now. ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258170 91177308-0d34-0410-b5e6-96231b3b80d8 commit 1618eb04cdfdd3febf77bc67cdac5307e5528b96 Author: Hans Wennborg Date: Tue Jan 19 18:53:02 2016 +0000 Merging r257925, r257929, r257930, and r257997: ------------------------------------------------------------------------ r257925 | mren | 2016-01-15 11:35:42 -0800 (Fri, 15 Jan 2016) | 10 lines CXX_FAST_TLS calling convention: fix issue on X86-64. When we have a single basic block, the explicit copy-back instructions should be inserted right before the terminator. Before this fix, they were wrongly placed at the beginning of the basic block. I will commit fixes to other platforms as well. PR26136 ------------------------------------------------------------------------ ------------------------------------------------------------------------ r257929 | mren | 2016-01-15 12:13:28 -0800 (Fri, 15 Jan 2016) | 10 lines CXX_FAST_TLS calling convention: fix issue on AArch64. When we have a single basic block, the explicit copy-back instructions should be inserted right before the terminator. Before this fix, they were wrongly placed at the beginning of the basic block. I will commit fixes to other platforms as well. 
PR26136 ------------------------------------------------------------------------ ------------------------------------------------------------------------ r257930 | mren | 2016-01-15 12:24:11 -0800 (Fri, 15 Jan 2016) | 8 lines CXX_FAST_TLS calling convention: fix issue on ARM. When we have a single basic block, the explicit copy-back instructions should be inserted right before the terminator. Before this fix, they were wrongly placed at the beginning of the basic block. PR26136 ------------------------------------------------------------------------ ------------------------------------------------------------------------ r257997 | mren | 2016-01-16 08:39:46 -0800 (Sat, 16 Jan 2016) | 12 lines CXX_FAST_TLS calling convention: fix issue on x86-64. %RBP can't be handled explicitly. We generate the following code: pushq %rbp movq %rsp, %rbp ... movq %rbx, (%rbp) ## 8-byte Spill where %rbp will be overwritten by the spilled value. The fix is to let PEI handle %RBP. PR26136 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258162 91177308-0d34-0410-b5e6-96231b3b80d8 commit aa96fb86c3304e81c2f53700223d0e795c302276 Author: Hans Wennborg Date: Tue Jan 19 18:26:37 2016 +0000 Merging r257902 (and r257775) ------------------------------------------------------------------------ r257775 | jyknight | 2016-01-14 08:33:21 -0800 (Thu, 14 Jan 2016) | 3 lines Revert "Stop increasing alignment of externally-visible globals on ELF platforms." This reverts commit r257719, due to PR26144. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r257902 | jyknight | 2016-01-15 08:33:06 -0800 (Fri, 15 Jan 2016) | 17 lines Stop increasing alignment of externally-visible globals on ELF platforms. 
With ELF, the alignment of a global variable in a shared library will get copied into any executable linked against it, if the executable even accesses the variable. So, it's not possible to implicitly increase alignment based on access patterns, or you'll break existing binaries. This happened to affect libc++'s std::cout symbol, for example. See thread: http://thread.gmane.org/gmane.comp.compilers.clang.devel/45311 (This is a re-commit of r257719, without the bug reported in PR26144. I've tweaked the code to not assert-fail in enforceKnownAlignment when computeKnownBits doesn't recurse far enough to find the underlying Alloca/GlobalObject value.) Differential Revision: http://reviews.llvm.org/D16145 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258155 91177308-0d34-0410-b5e6-96231b3b80d8 commit ed504bedd7420790d55e441e35e5449eaa40029e Author: Hans Wennborg Date: Tue Jan 19 17:28:24 2016 +0000 Merging r257905: ------------------------------------------------------------------------ r257905 | hans | 2016-01-15 09:04:45 -0800 (Fri, 15 Jan 2016) | 3 lines test-release.sh: Fix clang-tools-extra symlink for CMake build The CMake and Autoconf builds want the symlink set up differently. ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258146 91177308-0d34-0410-b5e6-96231b3b80d8 commit c1316b6adfbb17b961a3bee357e728ca0d4d1c96 Author: Hans Wennborg Date: Thu Jan 14 23:24:17 2016 +0000 Merging r257791: ------------------------------------------------------------------------ r257791 | hans | 2016-01-14 11:21:14 -0800 (Thu, 14 Jan 2016) | 4 lines Exclude test-suite from CMake builds in test-release.sh It's broken. In 3.7 there wasn't a CMake build for test-suite at all, so we're not losing something we had before.
------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@257836 91177308-0d34-0410-b5e6-96231b3b80d8 commit 25d64abdb39a834541edbafdc686f371dad58a76 Author: Hans Wennborg Date: Thu Jan 14 17:52:28 2016 +0000 Merging r257730: ------------------------------------------------------------------------ r257730 | majnemer | 2016-01-13 17:20:03 -0800 (Wed, 13 Jan 2016) | 11 lines [X86] Don't alter HasOpaqueSPAdjustment after we've relied on it We rely on HasOpaqueSPAdjustment not changing after we've calculated things based on it. Things like whether or not we can use 'rep;movs' to copy bytes around, that sort of thing. If it changes, invariants in the backend will quietly break. This situation arose when we had a call to memcpy *and* a COPY of the FLAGS register where we would attempt to reference local variables using %esi, a register that was clobbered by the 'rep;movs'. This fixes PR26124. ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@257779 91177308-0d34-0410-b5e6-96231b3b80d8 commit 7b9eef037dbacab102881f19826fb04cfe69c7e7 Author: Hans Wennborg Date: Thu Jan 14 00:23:32 2016 +0000 ReleaseNotes.rst: a few entries from Rafael git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@257725 91177308-0d34-0410-b5e6-96231b3b80d8 commit 53d8ef00d82460b9c8ce08617d91bbce8313d4a3 Author: Hans Wennborg Date: Wed Jan 13 21:18:59 2016 +0000 Merging r257648: ------------------------------------------------------------------------ r257648 | hans | 2016-01-13 10:59:45 -0800 (Wed, 13 Jan 2016) | 1 line Fix struct/class mismatch for MachineSchedContext ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@257668 91177308-0d34-0410-b5e6-96231b3b80d8 commit 38fcb6f10f0ae867bfe796f26bf1a336bf0dddf0 
Author: Dimitry Andric Date: Wed Jan 13 19:37:51 2016 +0000 Merging r257645: ------------------------------------------------------------------------ r257645 | dim | 2016-01-13 19:29:46 +0100 (Wed, 13 Jan 2016) | 22 lines Avoid undefined behavior in LinkAllPasses.h The LinkAllPasses.h file is included in several main programs, to force a large number of passes to be linked in. However, the ForcePassLinking constructor uses undefined behavior, since it calls member functions on `nullptr`, e.g.: ((llvm::Function*)nullptr)->viewCFGOnly(); llvm::RGPassManager RGM; ((llvm::RegionPass*)nullptr)->runOnRegion((llvm::Region*)nullptr, RGM); When the optimization level is -O2 or higher, the code below the first nullptr dereference is optimized away, and replaced by `ud2` (on x86). Therefore, the calls after that first dereference are never emitted. In my case, I noticed there was no call to `llvm::sys::RunningOnValgrind()`! Replace instances of dereferencing `nullptr` with either objects on the stack, or regular function calls. Differential Revision: http://reviews.llvm.org/D15996 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@257660 91177308-0d34-0410-b5e6-96231b3b80d8 commit 9faaefea9cbef6453486ed825c1ca4305bf68324 Author: Hans Wennborg Date: Wed Jan 13 19:03:44 2016 +0000 Drop 'svn' suffix from version. git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@257651 91177308-0d34-0410-b5e6-96231b3b80d8 commit 5ab5731312b6a8736fbe7fad1cb10f384b3a295e Author: Hans Wennborg Date: Wed Jan 13 17:34:56 2016 +0000 Creating release_38 branch off revision 257626 git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@257630 91177308-0d34-0410-b5e6-96231b3b80d8 commit 1526147c0ad76667de046ef168d5cc5eee381bb7 Author: TB Schardl Date: Tue Jan 12 12:40:37 2016 +0000 Bug fix to include setSuccessor method on reattach instruction. 
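The undefined behavior the LinkAllPasses.h commit above removes can be reproduced generically: calling a member function through a null pointer lets the optimizer treat the path as unreachable and delete everything after it. A minimal sketch of the broken pattern and the fix, using a hypothetical Pass type rather than LLVM's:

```cpp
#include <cassert>

// Broken variant (do NOT do this): after the first null dereference the
// compiler may lower the rest of the constructor to a trap (ud2 on x86),
// so later forced references are never emitted:
//   ((Pass *)nullptr)->run();   // UB -- everything below may vanish
struct Pass {
    int run() { return 42; }
};

// The fix from the commit: force the references through real objects on
// the stack (or plain function calls), which is well-defined and cannot
// be folded into unreachable code.
struct ForcePassLinking {
    int LastResult = 0;
    ForcePassLinking() {
        Pass P;               // object on the stack, not a null pointer
        LastResult = P.run(); // this call survives optimization
    }
};
```

This is why the original code silently lost its call to `llvm::sys::RunningOnValgrind()` at -O2: it sat below the first null dereference.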
commit 2b1b34e00cbc085a4a9a290c65fffaabae9517dc Author: TB Schardl Date: Thu Dec 31 04:05:48 2015 +0000 Add -instrument-cilk support to detach2cilk, cilkabi commit 4328b4468c0e42c1f89f5212e1386c38394edf20 Merge: 062301f913b 8a32dc47d61 Author: TB Schardl Date: Wed Dec 30 01:45:54 2015 +0000 Merge branch 'master' of http://llvm.org/git/llvm into tb-scratch commit 062301f913b5ac657607f0c758392ac8a18d5c13 Merge: 9893cc49b22 48a798cb4b4 Author: TB Schardl Date: Tue Dec 29 22:23:46 2015 +0000 Merge branch 'tb-scratch' of github.com:taekwonbilly/Parallel-IR into tb-scratch commit 9893cc49b223291071ea6633cd3f5c376acce9dd Author: TB Schardl Date: Tue Dec 29 22:22:01 2015 +0000 SimplifyCFG now removes unnecessary Sync instructions. commit 48a798cb4b473470ad6ceaa6cc3e45dd569d0627 Merge: 54dbddeaec7 8d00ea68834 Author: Billy Moses Date: Wed Nov 11 10:50:51 2015 -0500 for counting commit 54dbddeaec7fa2bcdb3ad906c2cb99232342f00b Merge: 19481e914d1 88d51ce445e Author: Billy Moses Date: Wed Nov 11 10:18:55 2015 -0500 moded commit 8d00ea68834b61ce260b8111beb594cbdc8c78b9 Merge: 2ae39eb69c5 65cad952e45 Author: TB Schardl Date: Fri Nov 6 11:51:30 2015 +0000 Merge branch 'master' of http://llvm.org/git/llvm into tb-scratch commit 2ae39eb69c54cfb2206514873bca9cb1ac3738b0 Author: TB Schardl Date: Thu Nov 5 14:58:05 2015 +0000 [SimplifyCFG] Fixed bug where empty reattach blocks with multiple predecessors would crash this pass.
commit 7bd0f59e1aa75abe8a238d1ec166d6148722ebdd Merge: 8ae8e06e3cd c135da21a3c Author: TB Schardl Date: Wed Nov 4 02:12:41 2015 +0000 Merge branch 'billy-scratch' of github.com:taekwonbilly/Parallel-IR into tb-scratch commit 8ae8e06e3cdf762ce50de096115ecfac5c998b63 Merge: a9530cd93a2 7e6636cb71f Author: TB Schardl Date: Wed Nov 4 01:26:22 2015 +0000 Merge branch 'master' of http://llvm.org/git/llvm into tb-scratch commit c135da21a3cca833224099aeeac85aad0ec5144d Author: Billy Moses Date: Mon Nov 2 23:13:23 2015 -0500 all cleaned up & ready to go commit a9530cd93a293b6e21665883a74b42859061acd8 Merge: 329f5fad3f7 1965754e592 Author: TB Schardl Date: Fri Oct 16 14:24:35 2015 +0000 Merge branch 'master' of http://llvm.org/git/llvm into tb-scratch commit 329f5fad3f72dd84a3e4cf5818512a6b7e81c657 Merge: e0717ad48cc 600b09339de Author: TB Schardl Date: Fri Oct 16 00:37:12 2015 +0000 Merge branch 'master' of http://llvm.org/git/llvm into tb-scratch commit e0717ad48cc7c447b4f1159116b06ff82c4efdd3 Merge: 20e95d87b5e 4b6405d130b Author: TB Schardl Date: Fri Oct 16 00:36:05 2015 +0000 Merge branch 'master' of http://llvm.org/git/llvm into tb-scratch commit 20e95d87b5e8234390f2b4cc6ef46a5ebea58e0c Merge: 44d4e427c7f bcd41c02dde Author: TB Schardl Date: Tue Oct 13 16:57:43 2015 +0000 Merge branch 'master' of http://llvm.org/git/llvm into tb-scratch commit cac7ff23aac4127106c74d7cdaa5b6f11d3d5e00 Merge: ab253e4510c 387b1f61aad Author: Billy Moses Date: Tue Oct 13 12:34:49 2015 -0400 Merge branch 'master' of github.com:taekwonbilly/Parallel-IR into billy-scratch commit ab253e4510c21e111e4c56fda345c19d3b232650 Author: Billy Moses Date: Tue Oct 13 12:34:31 2015 -0400 cache loop2cilk commit 44d4e427c7f008295af785fbad29857952be6d9a Merge: 387b1f61aad 938c3d3164e Author: TB Schardl Date: Tue Oct 13 12:52:44 2015 +0000 Merge branch 'master' of http://llvm.org/git/llvm into tb-scratch commit 387b1f61aad986ddc9032d82e2e48e9c5e1b064d Merge: 81e2fd12aea 3d58b720c31 Author: TB Schardl Date: 
Fri Oct 2 19:47:27 2015 +0000 Merge branch 'master' of http://llvm.org/git/llvm into tb-scratch commit 81e2fd12aea84c2ab59cd73cbcad2665a947ce0b Author: TB Schardl Date: Fri Oct 2 19:43:24 2015 +0000 Adding Detach2Cilksan pass to enable Cilksan race detection. commit 7a634e24c5bc7a520e8979646da17c09895f5425 Author: TB Schardl Date: Fri Oct 2 19:42:47 2015 +0000 Some debugging of Detach2Cilk commit cdf14afd5eeb21dedc32c3a62b1f76af95016974 Merge: 25f43658061 36caf0659ff Author: TB Schardl Date: Tue Sep 22 17:53:11 2015 +0000 Merge branch 'master' of http://llvm.org/git/llvm into tb-scratch commit 25f436580618875268ef313894e05802617bbdf0 Author: TB Schardl Date: Tue Sep 22 04:00:34 2015 +0000 Fixing loop rotation to prevent it from destroying sync instructions. commit 8ec1e7597748edd42654657f992aa4209bd04cf9 Merge: 4fc3d85490a dabf510ba1b Author: TB Schardl Date: Sun Sep 20 19:06:45 2015 +0000 Merge branch 'master' of http://llvm.org/git/llvm into tb-scratch commit 4fc3d85490a81d6adbd21b5f66646a9f397fe333 Author: TB Schardl Date: Sun Sep 20 19:06:24 2015 +0000 Fixed GVN to handle scalarPRE around detach/reattach and to abort load PRE in the event of an aliased access from a detach or sync. commit dc7cd94ca46ba477e113d2844de893b82b95b081 Author: TB Schardl Date: Sun Sep 20 19:05:06 2015 +0000 Updated AliasAnalysis to analyze detached blocks for aliasing information for detach and sync instructions. commit 421d2351ba4e14ff211a3c6cbe9258ccddf19afa Merge: 54b97afc6bc 29f50e97835 Author: TB Schardl Date: Sun Sep 13 12:11:13 2015 +0000 Merge branch 'master' of http://llvm.org/git/llvm into tb-scratch commit 54b97afc6bc145d1e28a8a3c94de524d809cddf1 Author: TB Schardl Date: Wed Sep 9 20:25:28 2015 +0000 Making syncs look like fences, in order to fix memory analysis issues. 
commit 4420c17e34959d2a33ba4c9fd9ae5ff6066f797a Merge: e6d3b51ad7d 3c76435341d Author: TB Schardl Date: Wed Sep 9 01:12:29 2015 +0000 Merge branch 'billy-scratch' of github.com:taekwonbilly/Parallel-IR into tb-scratch commit e6d3b51ad7de5aaece38701cbe0b9401f481b13c Merge: eaa3d3ce261 9e01a11e67c Author: TB Schardl Date: Tue Sep 8 21:47:52 2015 +0000 Merge branch 'master' of http://llvm.org/git/llvm into tb-scratch commit 3c76435341d943764ecafb324971a254c95b39df Author: Billy Moses Date: Tue Sep 8 16:40:32 2015 -0400 Working parallel opt pass commit eaa3d3ce261db5812277ba6cd250ce501f77849c Merge: d9eeab4f9c8 3d88beedefc Author: TB Schardl Date: Tue Sep 8 17:14:47 2015 +0000 Merge branch 'master' of http://llvm.org/git/llvm into tb-scratch commit d9eeab4f9c8bd662a771d87e73f61165c12cd14b Merge: f09f6e7a51b 7e316839810 Author: TB Schardl Date: Tue Sep 8 15:49:47 2015 +0000 Merge branch 'master' of http://llvm.org/git/llvm into tb-scratch commit 5f20c20dcf53f27e56915263e99d810bbf403697 Author: Billy Moses Date: Mon Sep 7 22:05:31 2015 -0400 Semi-working cilk pass commit f09f6e7a51b1b270a48d2f66312ff282f1ad6959 Author: TB Schardl Date: Fri Sep 4 12:13:17 2015 -0400 Fixed build problems with last merge. 
commit 8b666563572297a50f9a17efbd060e8f780f0f04 Merge: abe3f70de04 2354b37ae03 Author: TB Schardl Date: Fri Sep 4 11:40:09 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into tb-scratch commit 41ddcdf5d8e40544ece73167368487f0195b1b5f Merge: fea705e7114 abe3f70de04 Author: Billy Moses Date: Tue Sep 1 23:17:47 2015 -0400 Merge branch 'tb-scratch' of github.com:taekwonbilly/Parallel-IR into billy-scratch commit fea705e71145c13d37dcedf6b260ed38d75b7ad1 Merge: dd9331be0b0 19481e914d1 Author: Billy Moses Date: Tue Sep 1 17:14:52 2015 -0400 Merge branch 'tb-scratch' into billy-scratch commit dd9331be0b0f2c6172666774f3f9d3fb17121154 Author: Billy Moses Date: Tue Sep 1 17:13:27 2015 -0400 Commit detach pass before merge commit abe3f70de0450a6ff4d169e2f8a7c884f38b5b43 Merge: 61fde862bba ac515c40878 Author: TB Schardl Date: Tue Sep 1 16:59:07 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into tb-scratch commit 61fde862bba820f143ea0545dc1804fe53523efc Merge: 19481e914d1 9907691f42a Author: TB Schardl Date: Sun Aug 30 09:37:44 2015 -0400 Merge branch 'billy-scratch' of github.com:taekwonbilly/Parallel-IR into tb-scratch commit 19481e914d1b1c4ee1db106d8f01b986ba4f90ae Merge: fadec4720ee 2b5188b98a3 Author: TB Schardl Date: Sun Aug 30 08:51:34 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into tb-scratch commit fadec4720ee7b66c5f4a362e2f0e0e8b2c127ce6 Merge: 4fcaa4205d2 43928f79096 Author: TB Schardl Date: Sat Aug 29 12:03:38 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into tb-scratch commit 4fcaa4205d29c0c7c96d5e422f16db53db786e82 Author: TB Schardl Date: Sat Aug 29 12:03:11 2015 -0400 Fixed bug where JumpThreading would attempt to split reattach edges. commit 6342321c427d73af4fafe79c88d60d5945d192e2 Author: TB Schardl Date: Sat Aug 29 12:02:17 2015 -0400 Fixed bug where SCCP did not recognize detach/reattach/sync. 
commit cd5c25c6646f9fa4472be7f4148e938b3db180fc Author: TB Schardl Date: Fri Aug 28 18:12:45 2015 -0400 Removing dead code from SROA. commit 613e58985cd9077134dc120d465bbf4ad7c624b1 Merge: 16929701716 21f084aa722 Author: TB Schardl Date: Fri Aug 28 18:07:45 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into tb-scratch commit 9907691f42ac9a51278d9a4fb20496f1a08531cc Author: Billy Moses Date: Thu Aug 27 14:58:29 2015 -0400 Add temporary hack to enable compiling serial version to executable commit 42a2eef9caf19027aac8829f2e90cc3194e87fe4 Merge: 703f88a7461 2d184c72270 Author: Billy Moses Date: Wed Aug 26 16:57:08 2015 -0400 Merge branch 'tb-scratch' into billy-scratch commit 16929701716110895498f4d5528c740355545472 Merge: 2d184c72270 4abce6e698a Author: TB Schardl Date: Tue Aug 25 14:00:34 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into tb-scratch commit 2d184c7227076f1843ef28ab46c9a6736cb5faea Author: TB Schardl Date: Sun Aug 23 11:49:32 2015 -0400 Relaxed need for commutativity in serial TRE. commit aecdc8f291e3faa379ec24337be337095a685ea0 Author: TB Schardl Date: Sun Aug 23 11:49:03 2015 -0400 Fixed bug in BitcodeWriter with reattach causing opt to crash when emitting bitcode. commit f7f1cce493e65e181225f5d439cfdc1798717e2e Merge: 45d7087de1c 8724a428dfd Author: TB Schardl Date: Sat Aug 22 09:50:44 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 45d7087de1c8bc1360e107a30c937d9b24189f49 Author: TB Schardl Date: Sat Aug 22 09:43:10 2015 -0400 Draft enhancement to accumulator TRE to use identity values. commit 85eda242bd0b50027d4859450206d336e3e585f5 Merge: f135205b97a 0d125ca11e9 Author: TB Schardl Date: Tue Aug 18 11:25:00 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit f135205b97a8352113ff27c8fa8158aade75254a Author: TB Schardl Date: Tue Aug 18 10:40:50 2015 -0400 Adding 'getIdentity()' method to Instruction to enhance serial TRE. 
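The accumulator-TRE enhancement above ("Draft enhancement to accumulator TRE to use identity values" plus the `getIdentity()` method) hinges on knowing each operation's identity element (0 for add, 1 for multiply). A hand-applied sketch of the transformation in C++ source form — the optimizer of course performs this on IR, not on source:

```cpp
#include <cassert>

// Recursive form: not a tail call, because the multiply happens after
// the recursive call returns.
static unsigned long factRec(unsigned n) {
    if (n <= 1)
        return 1;
    return n * factRec(n - 1);
}

// What accumulator TRE produces: the pending multiplies are folded into
// an accumulator seeded with the identity value of '*', turning the
// recursion into a loop.
static unsigned long factLoop(unsigned n) {
    unsigned long Acc = 1; // identity element for multiplication
    for (; n > 1; --n)
        Acc *= n;
    return Acc;
}
```

Seeding the accumulator with the identity (rather than a first loop iteration's value) is what lets the transform handle the base case uniformly — and it is the value a `getIdentity()`-style query would supply per operation.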
commit fe40d5f2a3d392c9836968fb0c8ba3df1ebc908c Merge: d3cdbb9137e 378e97e50c4 Author: TB Schardl Date: Mon Aug 17 08:52:52 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit d3cdbb9137e07f806ce69ae7f327749694b7b8b2 Merge: 653d0bbdd47 126b405bec6 Author: TB Schardl Date: Sat Aug 15 11:33:43 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 653d0bbdd47f7c8520941a9ea1ca5ce2d431bda5 Merge: 99611974297 26e17390798 Author: TB Schardl Date: Fri Aug 14 09:25:49 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 99611974297552647706e46eb290be13f1ee6a82 Merge: 4bf70c75ac9 22af77d94f3 Author: TB Schardl Date: Thu Aug 13 12:36:36 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 4bf70c75ac9f1d5eee6b5c2cbfbdb9b5d0de8f3b Merge: 4dec88872b7 a5ccfee2752 Author: TB Schardl Date: Tue Aug 11 13:31:22 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 4dec88872b7e91e3f680a304b98ee3f197f5e9db Merge: e2aac9890d9 abdf937a221 Author: TB Schardl Date: Mon Aug 10 12:53:34 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit e2aac9890d934cff8b1f09d5c31fa6c804b80bb0 Merge: 8b8574d13a1 73b16a70f16 Author: TB Schardl Date: Sat Aug 8 09:12:17 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 8b8574d13a13cab91984d55cb78ebfae7caaf941 Merge: 2ee8648835e 1962b1b6b7e Author: TB Schardl Date: Fri Aug 7 09:04:50 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 2ee8648835e211ba1a93501acb5ece9f3d5d406b Author: TB Schardl Date: Thu Aug 6 08:53:21 2015 -0400 Bug fix on marking Sync instructions as potentially reading or writing memory commit 156cf024ecde0d1a725e32239c3057c71297fcfa Merge: 7d823a9c882 7809bb2e968 Author: TB Schardl Date: Thu Aug 6 08:08:36 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 7d823a9c882be773768c6c38d92cad7da9880b2f Author: TB Schardl Date: 
Thu Aug 6 08:08:02 2015 -0400 Adding SyncInst to set of instructions that may read/write memory. commit 383d9f685189d8294df1f988e7b2c328b2227873 Author: TB Schardl Date: Tue Aug 4 15:15:40 2015 -0400 Fixed typos from previous merge. commit 90a25b1e5633c00cec6a5dd77b998aeb9bfbfc19 Merge: 7907e1dbfd7 a639e155a28 Author: TB Schardl Date: Tue Aug 4 14:30:25 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 7907e1dbfd714cc121978597e0e552b1aa6eb195 Merge: 9819737b739 c71235ab7d7 Author: TB Schardl Date: Fri Jul 31 08:49:30 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 9819737b7396013f3d6dee738b070f11b1a52e8c Merge: 2c1c7bc0320 dc9125e8d13 Author: TB Schardl Date: Wed Jul 29 08:48:13 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 2c1c7bc0320cf3d5a74b2ad8cf91f24fa641da97 Author: TB Schardl Date: Wed Jul 29 08:47:45 2015 -0400 Adding function to SimplifyCFG to elide detach statements whose continuation immediately syncs. commit c950f20aa21eca8300eed7b10f98e4b61109311d Author: TB Schardl Date: Tue Jul 28 10:48:23 2015 -0400 Added optimization to remove trivial reattach blocks. 
commit 86df0ba3770a03a8271a5bba7f1a3708b3f0d153 Merge: 3fbb3bcf4cb bf26b3fcaec Author: TB Schardl Date: Mon Jul 27 08:22:30 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 3fbb3bcf4cbbe96c286774917025664dd8e2de80 Merge: 7bb5864b2ad 52f969b0298 Author: TB Schardl Date: Thu Jul 23 08:57:48 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 7bb5864b2ad318eb969b7f8d78e6d5171a8b9cbc Merge: 9a2143e2643 717d8ad6cf4 Author: TB Schardl Date: Wed Jul 22 08:02:57 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 9a2143e26433557e7f1eac221099bd037e487e80 Merge: c9d4623ac37 c721349466d Author: TB Schardl Date: Tue Jul 21 08:42:46 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit c9d4623ac37b0ba06e727dc71df3ec559a267762 Author: TB Schardl Date: Mon Jul 20 15:56:29 2015 -0400 Don't perform PRE across a detach or reattach, as it requires splitting a critical edge. commit e1df337ce92636114885f3268afaa571e279bcb2 Author: TB Schardl Date: Mon Jul 20 15:55:53 2015 -0400 Detach-reattach pairs create unsplittable critical edges. Add some asserts to check that we don't try to split those edges. commit 48ec13d545fde4c80f86132b330dec9c672c29b3 Author: TB Schardl Date: Mon Jul 20 13:32:44 2015 -0400 Minor edit to instruction combining to avoid pessimization of moving code after a sync. commit 46d9cfe4c634c7229c16623ca17f0b27d3c7ad28 Merge: c99bacd4cec 96d9043a78b Author: TB Schardl Date: Mon Jul 20 10:53:45 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit c99bacd4cecc8f6a9b0f159d957c81ca90a53c06 Author: TB Schardl Date: Mon Jul 20 10:50:26 2015 -0400 Updating existing optimization passes to generate correct code around detach/reattach/sync instructions. Tested on fib and simple race example codes. Some passes, such as redundant instruction combining, are still pessimizations for these parallel codes. 
commit bf96714f54abff14ce58abec408cafb5367ab0fe Merge: c8594201bba 591adee23bf Author: TB Schardl Date: Fri Jul 17 09:14:14 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit c8594201bba51bac12bf581ec5f11aff5e767f9b Merge: c10991b43d5 72400f8d508 Author: TB Schardl Date: Thu Jul 16 08:59:20 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit c10991b43d5dedafd23d7579635da4e111fd598c Merge: 1d47de608d6 4aa2f4514cc Author: TB Schardl Date: Wed Jul 15 08:16:18 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 1d47de608d6e59908f715569137f5e2dac1f339a Merge: 3a70241cdea 815d6131a4d Author: TB Schardl Date: Tue Jul 14 09:19:14 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 3a70241cdea09232c8e26cfe42e56fac598ed8ba Author: TB Schardl Date: Sun Jul 12 08:24:32 2015 -0400 Updated PromoteMemoryToRegister to properly handle reattach, specifically, to avoid promoting alloca's if doing so would require a Phi node to inherit register state through a reattach. commit 51d54d96cc3cdaec661ea2268e8dd6294b22375a Author: TB Schardl Date: Sun Jul 12 08:23:11 2015 -0400 Adjusting reattach to look more like a branch. 
commit d39d1f75be719678706e403c64d1a53f9387ef98 Author: TB Schardl Date: Sun Jul 12 08:22:20 2015 -0400 Updated comments in IRBuilder commit 08f1f890d00a14f4ffccdf7da44b8c7b0e5daa12 Merge: 3fa3c489669 1e3fa768c01 Author: TB Schardl Date: Sat Jul 11 07:59:42 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 3fa3c489669220cef599f61adb52c0b3eba4bc0c Merge: 48100a712f7 e57b60a7f96 Author: TB Schardl Date: Fri Jul 10 08:48:24 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 48100a712f7dddd6578ec0f93fd55ae5ddc033ce Merge: 72a88786c60 86b4ed2fc40 Author: TB Schardl Date: Thu Jul 9 09:01:23 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 703f88a7461d9741c5d0203c02e702f48cda74e7 Merge: 5c355339f57 8e3d42ecb81 Author: Billy Moses Date: Wed Jul 8 21:51:37 2015 -0700 Merge branch 'tb-scratch' of github.com:taekwonbilly/Parallel-IR into billy-scratch commit 5c355339f57181fbf8ce8e665ce4a5e1b18a6a35 Author: Billy Moses Date: Wed Jul 8 21:50:21 2015 -0700 fix merge error commit 72a88786c604e0c99dace11e7ab02b9bea53c7c4 Merge: ab1078ca539 080d7a819f4 Author: TB Schardl Date: Wed Jul 8 07:54:34 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit ab1078ca5394d4a132b9bfef2b45fe9936355c62 Author: TB Schardl Date: Tue Jul 7 23:59:33 2015 -0400 Rework reattach to take a basic block as an argument. Reattach is therefore like a break, while not being a break. 
commit 189cbf6873ffb4880a10098341abdc18447d38d3 Merge: 8e3d42ecb81 7b7c81cd353 Author: TB Schardl Date: Tue Jul 7 13:45:38 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 8e3d42ecb81ed3d9b8a9bc45e17ba151aaba45fc Author: TB Schardl Date: Tue Jul 7 08:58:35 2015 -0400 Initial hack to disallow SimplifyCFG from removing sync instructions commit 738e14f4a4dcb70e7e9e6ff1e0727b05ce14c008 Author: TB Schardl Date: Tue Jul 7 08:57:47 2015 -0400 Fix comments on SyncInst commit bf1508cc4427479a10092210237db6678c1ef6d5 Merge: 19e947bd14f 2822246ecee Author: TB Schardl Date: Tue Jul 7 08:18:24 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 19e947bd14f9b9e718ab634481a0a0d96962b216 Merge: de195a8462b a25ee390b55 Author: TB Schardl Date: Mon Jul 6 08:24:00 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit bb180502346ef66f459596d49bc26c15dc822f88 Merge: c6662084d9e de195a8462b Author: Billy Moses Date: Sun Jul 5 21:36:33 2015 -0700 Merge branch 'tb-scratch' into billy-scratch Conflicts: include/llvm/Bitcode/LLVMBitCodes.h lib/AsmParser/LLParser.cpp lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h lib/IR/Instructions.cpp commit de195a8462b52201728b19904a7ff895e3c2b8a0 Author: TB Schardl Date: Thu Jul 2 15:04:46 2015 -0400 Temporary sync instruction, in order to develop dominance analysis for CFG's with parallel control dependencies. 
commit 738db4461c0b4305c31f9feab72003012c2dcea8 Merge: 02ff4acf5a2 e4e6f29c93d Author: TB Schardl Date: Thu Jul 2 08:52:48 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 02ff4acf5a260ed830edf5f0764c49f3ce5bdfda Merge: 999aed1e3d0 7e6843cbd68 Author: TB Schardl Date: Wed Jul 1 09:43:34 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 999aed1e3d0c0df3d4d3d8f5b4ebbe7181834cba Author: TB Schardl Date: Wed Jul 1 09:42:21 2015 -0400 Teaching SCCP about 'detach' and 'reattach', such that optimization passes can run on codes with these IR instructions. commit d2f3f1e9b8c80feb8621e3897998a24c68365bed Merge: ea299f63c15 37cb5f1c2db Author: TB Schardl Date: Wed Jul 1 08:33:51 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit ea299f63c158dd1b90dcef36729f361c69f54505 Author: TB Schardl Date: Tue Jun 30 08:27:46 2015 -0400 Renamed 'spawn' to 'detach' commit f88a6553ebb86f8d5304a7b8df238b2274d936cd Merge: 3b6df76c9a1 a5106ca54d0 Author: TB Schardl Date: Tue Jun 30 08:03:44 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 3b6df76c9a154c3ae22fe89569dfdac23637d12b Merge: e62bd55cd9d e8f07a7eb39 Author: TB Schardl Date: Mon Jun 29 09:50:59 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit e62bd55cd9d749090f9137363ae55ada11a2eb4e Merge: 4dc79856c77 43e99f618db Author: William Moses Date: Sun Jun 28 16:13:19 2015 -0700 Merge pull request #1 from taekwonbilly/billy-scratch Add reattach instruction commit c6662084d9ecac843815ba39681d6ad2b3c3faaf Author: Billy Moses Date: Sun Jun 28 16:09:53 2015 -0700 allow to compile commit 43e99f618db80683c40b98110a9320fb88f2b75f Author: Billy Moses Date: Sat Jun 27 13:20:11 2015 -0700 add token commit 1a4a51b9510224c583acc08555807713a26277e2 Author: Billy Moses Date: Thu Jun 25 22:35:46 2015 -0700 Reattach commit 5861430d7fe8c36d01e42d5a79765232d3733a55 Author: Billy Moses Date: Thu Jun 25 22:13:27 2015 -0700 
Reattach commit 4dc79856c77887cd506b15fee5793608071c7b0d Author: TB Schardl Date: Thu Jun 25 14:08:59 2015 -0400 Cleanup: remove unnecessary space commit 592fd5576cc26e3a0ba7efe4918b0c8f94c54b0f Merge: 08297c02e75 eebe475623c Author: TB Schardl Date: Thu Jun 25 13:49:16 2015 -0400 Merge remote-tracking branch 'origin/tb-scratch' into pir commit 08297c02e75ec7416751d443a99239d464c90061 Author: TB Schardl Date: Thu Jun 25 13:46:26 2015 -0400 Porting spawn instruction to current LLVM master. Added CreateSpawn to IRBuilder. commit eebe475623c877375a6718b362a76e2bd8843e11 Author: TB Schardl Date: Wed May 27 10:00:32 2015 -0400 cleaning up directory to support compilation on my system within a separate build directory commit 41059692e83eacd80f90f7df15510f97ae7c679d Author: Billy Moses Date: Tue May 26 18:27:48 2015 -0400 fix commit 21846df31a8b5b45b82781e8f8a6eb8c9c8dcb0f Author: Billy Moses Date: Tue May 26 17:55:07 2015 -0400 rm scruff commit d3d85e53fb33660f44a60f9e1c04c133596a7344 Author: Billy Moses Date: Tue May 26 13:13:00 2015 -0400 cleanup commit 8cc15c93dcee39782e92168f85e67fb7db46d069 Merge: 218888afe22 f3fb567248e Author: Billy Moses Date: Tue May 26 13:11:33 2015 -0400 Merge branch 'master' of github.com:taekwonbilly/Parallel-IR fix issue commit 218888afe22e6c297f19a5641809492429be18a7 Author: Billy Moses Date: Tue May 26 13:09:58 2015 -0400 fixed commit f3fb567248ece821dd2cd77008d1be0c385e78b0 Author: Billy Moses Date: Wed May 20 16:45:43 2015 -0400 ud commit 8721e720eeb689bf1e9f3f401a4aa851725cc126 Author: Billy Moses Date: Wed Apr 15 09:57:41 2015 -0400 reset commit b1dd73bcb3e3adc89c78acf620b81a7271f261b3 Author: Billy Moses Date: Mon Apr 13 10:13:29 2015 -0400 Last commit before change syntax commit 5cdcb6294493acf8bf10274c3a4a6f1f70c6de36 Author: Billy Moses Date: Mon Apr 6 12:05:40 2015 -0400 Updated llvm / added Future Type commit 2ce961b4e05eab9cb04b59e73ee1209b74e39524 Author: Billy Moses Date: Wed Apr 1 11:04:13 2015 -0400 update llvm commit 
5a8e342deb6ff3f9535890096b76731028740219 Author: Billy Moses Date: Tue Mar 17 22:55:24 2015 -0400 Added llvm src commit 140e15b2bddcc72a1a07b1dce8b84ae00f371e55 Author: Billy Moses Date: Tue Mar 17 22:11:09 2015 -0400 first commit Bug fixes from previous merge Updated clang submodule Code cleanup to reduce diff against mainline LLVM. Additional code cleanup. Fixes to address several failing LLVM regression tests. Based on the SyncElimination tests, specifically "for2.ll," it appears that SyncElimination removes sync instructions that are not safe to remove. One relevant test has been updated to note this problem and marked "XFAIL." [CodeGen] Reverting an earlier change to SelectionDagISel for Cilk codes. Previously, to fix an “rbp/rsp issue” with Cilk codes, SelectionDagISel was changed to set a flag in functions that expose “returns twice”, in order to make those functions appear to contain variable sized objects. Setting this flag causes LLVM regression test “CodeGen/X86/setjmp-spills” to fail. Setting related flags, such as “HasOpaqueSPAdjustment”, through their existing public interface also causes the same regression test to fail. In addition, I don’t see any rbp/rsp issues with Cilk codes when SelectionDagISel does not set any such flag. For these reasons, I'm removing this previous change to SelectionDagISel. [Tapir] Adding test to verify that LoopSpawning properly handles parallel loops in Tapir whose body reads the loop limit. [PassManager] Reworking Tapir modification to PassManagerBuilder to ensure that Sanitizer instrumentation passes run only once. This change should also help improve Tapir's compatibility with LTO. [Tapir] Updating simple LoopSpawning test. [TSan] Reverting change to TSan instrumentation, which was causing a test to fail. We will need to introduce a new instrumentation pass specifically for CilkSan. Updated clang submodule [CilkSanitizer] Added custom instrumentation pass for CilkSan. 
[ThreadSanitizer] Removing old change to ThreadSanitizer for use in CilkSan. [CilkSanitizer] Added instrumentation of memory intrinsics and atomics. Added simple optimization to elide instrumentation of non-captured pointers in serial functions. [Tapir] Removed requirement to unify returns in all functions for Tapir lowering to Cilk ABI. Updated clang submodule. [CilkSanitizer] Improved analysis for avoiding instrumentation. [CilkSanitizer] Allow CilkSanitizer to handle a larger variety of memory access sizes and to properly ignore memory accesses of illegal sizes. Squashed commit of the following: commit 9eef73e8b7b5dab5d8e04a0fa584fd765e5b1d13 Author: TB Schardl Date: Fri Aug 4 01:43:13 2017 +0000 [TRE] Fix bug with Tapir modification of TRE that was causing unit tests to fail. commit 92b16128f980b6683cb53a324480d7305f4327d4 Author: TB Schardl Date: Thu Aug 3 13:10:01 2017 +0000 [README] Attempting to clean up README file. commit fa242e0f01133707c3a483cfabedf3ee28abba7a Merge: a8e2b795fb3 f55a27066ac Author: TB Schardl Date: Thu Aug 3 12:52:13 2017 +0000 Merge branch 'master' of github.com:wsmoses/Parallel-IR commit a8e2b795fb34c87cd2c884235c3b50be0c17c3e7 Author: TB Schardl Date: Thu Aug 3 12:49:10 2017 +0000 [README] Updated README. commit f55a27066ac03e39e6a01ca30e86bc48df76fa7e Author: William S. Moses Date: Tue Aug 1 20:17:47 2017 +0200 Add CircleCI commit 964b5bea84c59cdc7e27bc07e98f12edc821c4fc Author: TB Schardl Date: Wed Aug 2 21:35:11 2017 +0000 [LoopSpawning] Correctly handle Tapir loops where the loop body uses the variable storing the number of loop iterations. 
Fixes #13 commit 8d4f443d9c9b78478279d598c4eb9abd79db1e59 Merge: 452aac7e148 ef122d645a8 Author: TB Schardl Date: Wed Aug 2 21:35:22 2017 +0000 Merge branch 'master' of github.com:wsmoses/Parallel-IR commit 452aac7e14852491121f7ca26f24f420414a5245 Author: TB Schardl Date: Wed Aug 2 21:35:11 2017 +0000 [LoopSpawning] Correctly handle Tapir loops where the loop body uses the variable storing the number of loop iterations. Fixes #13 commit ef122d645a83c9ad9ee743329208ee001071a4f2 Author: William S. Moses Date: Tue Aug 1 20:17:47 2017 +0200 Add CircleCI commit 9be75a22ad015c307665d277994651671a15ae60 Author: TB Schardl Date: Mon Jul 10 15:57:49 2017 +0000 [CSI] Bug fixes and refactoring of the CSI instrumentation pass. commit 6ce5f2f27b1bc2d92e48420376c2a37d1608f3a1 Author: TB Schardl Date: Mon Jul 10 13:37:39 2017 +0000 [Tapir] Allow Tapir lowering to Cilk to fill in missing definitions of internal Cilk types, including __cilkrts_worker and __cilkrts_pedigree. commit 631e4626d2ba614eaf8a68113c2fdf02f9f8e246 Author: TB Schardl Date: Fri Jun 30 21:33:54 2017 +0000 [DetachSSA] Initial implementation of an analysis pass that tracks the creation and synchronization of detached tasks. This analysis is based on MemorySSA. commit 923a9052c95c43df1405fad56f2cb1ef12a47412 Author: TB Schardl Date: Tue Jun 27 21:54:51 2017 +0000 [Tapir] Adding support for sync regions. A sync region is designated by a token emitted by a call to @llvm.syncregion.start. The detach, reattach, and sync instructions all take this token as a parameter. A sync instruction in a sync region SR only waits on computations detached from detach instructions in the same sync region or in a detached descendant thereof. By convention, a call to @llvm.syncregion.start occurs in an entry block, that is, either the entry block of a function or the entry block of a detached sub-CFG. For Cilk programs, a sync region is started for any function that performs a _Cilk_spawn or _Cilk_sync. 
A separate sync region is also started for each _Cilk_for in the function. Sync regions address two issues with sync instructions. First, with sync regions, the implicit sync at the end of a _Cilk_for only waits on the parallel iterations of that _Cilk_for, not on any other spawned computation within the function. Second, when a function is inlined, any _Cilk_sync performed by that function will not erroneously wait on detached computations in its caller. This commit includes simple cleanup passes involving sync regions. One form of cleanup removes sync instructions in sync regions that contain no detach instructions. Another form removes empty sync regions, i.e., calls to @llvm.syncregion.start whose produced token is never used. Future work will analyze sync regions more carefully and combine them when it is deemed safe. commit 9b55aac80aca2a520ba7627a020af413be18a29f Merge: 9b5abba8e85 eece7bcb178 Author: TB Schardl Date: Sat Jun 3 12:42:01 2017 +0000 Merge branch 'master' of github.com:llvm-mirror/llvm commit 9b5abba8e85b01c08d49885fdc6d871ed0e522e9 Merge: 51a4df5f3e5 6ef5e10ad7e Author: TB Schardl Date: Wed May 31 02:07:52 2017 +0000 Merge branch 'master' of github.com:llvm-mirror/llvm commit 51a4df5f3e536a65c0a926ee7c87eb47c80aec7f Merge: 6f69cdf478c 0559b4fa45c Author: TB Schardl Date: Tue May 30 18:19:52 2017 +0000 Merge branch 'master' of github.com:llvm-mirror/llvm commit 6f69cdf478cc2801c74964e3a233ad46d16245cc Author: William S. Moses Date: Mon May 15 01:15:30 2017 -0400 remove Rhino print commit d719d172fd8967cccb6625ff1ec54e439cdfe989 Merge: d2b4d301879 2db0ffd4753 Author: William S. Moses Date: Mon May 15 01:04:30 2017 -0400 Merge branch '6898' of github.com:wsmoses/Parallel-IR into 6898 commit d2b4d301879c0a75cbbd9d7c49e51581543ff08b Author: William S. 
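The sync-region mechanism described above can be illustrated with a small IR sketch (function and label names here are hypothetical, and the exact syntax in this fork may differ slightly from this sketch):

```llvm
define void @example() {
entry:
  ; By convention, the sync region is started in an entry block.
  %sr = call token @llvm.syncregion.start()
  ; detach, reattach, and sync all take the region token as a parameter.
  detach within %sr, label %spawned, label %continue

spawned:
  ; ... detached (spawned) work ...
  reattach within %sr, label %continue

continue:
  ; Waits only on tasks detached within sync region %sr (or in a
  ; detached descendant), not on tasks from any other sync region.
  sync within %sr, label %exit

exit:
  ret void
}

declare token @llvm.syncregion.start()
```

Because the token ties each sync to its own region, the implicit sync of a _Cilk_for (which uses a separate region) cannot wait on unrelated spawns, and an inlined _Cilk_sync cannot wait on detaches in its caller.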
Moses Date: Mon May 15 01:04:14 2017 -0400 pushing rhino flag commit 2db0ffd47534ee35deaea877d73d8484cb94c01f Author: Douglas Kogut Date: Mon May 15 00:24:54 2017 -0400 spawn unswitch commit 8f57e0739bf9fc6736472c89f91a533630efd5c3 Merge: 9660ce4abc0 be7eafc7179 Author: William S. Moses Date: Sun May 14 17:36:17 2017 -0400 Merge branch 'master' of github.com:wsmoses/Parallel-IR into 6898 commit 9660ce4abc060598a20b7c5d30a217bdc3af569e Merge: 002fb57bb06 780934e4b6a Author: William S. Moses Date: Sun May 14 17:35:58 2017 -0400 Merge branch 'master' into 6898 commit 002fb57bb069f18319ceab0d287c22166999a766 Merge: 35669cce54f acefa6d5a77 Author: William S. Moses Date: Sun May 14 15:32:41 2017 -0400 Merge branch '6898' of github.com:wsmoses/Parallel-IR into 6898 commit acefa6d5a77cad0cb2da8f5c6cfe3af1ca15129e Author: Douglas Kogut Date: Sun May 14 14:58:08 2017 -0400 spawn unswitch commit be7eafc7179b8591b0007a25a2e3aae31cfc7818 Author: TB Schardl Date: Tue May 9 21:34:49 2017 +0000 [Mem2Reg] Updated Mem2Reg to find the entry blocks of the function and all detached sub-CFG's more efficiently. commit 12f929ae136d57fd9e744bc2dac8c072d01e2053 Author: TB Schardl Date: Tue May 9 21:15:58 2017 +0000 [CilkABI] Marked additional loads and stores to CilkRTS stack frames as volatile. Fixed bug in extracting exception-handling exit blocks for detached CFG's. commit 9bf9a4d58c9f3a09164b8a86202bcee2f5abf553 Author: TB Schardl Date: Tue May 9 21:14:33 2017 +0000 [InstCombine] Fixed bug to prevent InstructionCombining pass from sinking operations that read memory across Tapir instructions. 
commit 719872be7ce9d8cdbc7036c6eb7d3d77ebeff5cf Merge: f63b0fed940 10826f2652f Author: Douglas Kogut Date: Fri Apr 28 20:39:49 2017 -0400 Merge branch '6898' of github.com:wsmoses/Parallel-IR into 6898 commit f63b0fed9406ac9f5f8b54626a9c6ef965cceaba Author: Douglas Kogut Date: Fri Apr 28 20:39:34 2017 -0400 pushing measuring scripts commit 991ca791848c9936677a0b7184a77cf0eaf6734d Author: TB Schardl Date: Wed Apr 26 12:17:07 2017 +0000 [LoopSpawning] Cleaning up code for handling exceptional exits. commit 10826f2652fea87d11ec166954c2d7b02917c21d Author: Jiahao Li Date: Tue Apr 25 23:24:56 2017 -0400 Alters sync elimination pfor microbenchmark. commit 9d5172300fcd2528dc4db210beccfa6cecb7816f Author: Jiahao Li Date: Tue Apr 25 23:07:07 2017 -0400 Makes LoopFusePass work. commit 46720980313325bf80262b8fd447db8e90f1c307 Author: TB Schardl Date: Wed Apr 26 00:10:42 2017 +0000 [LoopSpawning] Bug fix to find all exception-handling exit blocks of a Tapir loop. commit 48e7791f51c0a3b0fc27cc280e458892dac30fbd Author: TB Schardl Date: Tue Apr 25 01:30:48 2017 +0000 [Tapir] Preliminary support for C++ exceptions on Linux. commit 4613a6461de60516a6242270e4c6cd7beb1c5bec Author: TB Schardl Date: Tue Apr 25 01:28:09 2017 +0000 [CSI] Updated CSI pass to support separate property types per IR object. commit d5331895cb2d1437b7788469ac72c731b65a949b Author: Jiahao Li Date: Sat Apr 22 15:21:03 2017 -0400 Have makefile for sync_elimination_pfor_mb emit .ll for the sync eliminated version. commit 3b2b3c3429af3f1a173970cef45844639d35361b Author: Jiahao Li Date: Sat Apr 22 15:09:04 2017 -0400 Cleans up makefile for sync_elimination_pfor_mb. commit 21aa2bbee01f1dbc86681a7ed78b7cfd8fd611d5 Author: Bojan Serafimov Date: Sat Apr 22 14:57:32 2017 -0400 Fix compile error commit 0c5e6d15f12288dc29e9f08ff9d011c1204f69ba Author: Jiahao Li Date: Sat Apr 22 14:45:38 2017 -0400 Fixes sync_elimination_pfor_mb micro benchmark. 
commit a387e9f3e16ab5253eec663bbb56c246e4dbda55 Author: Jiahao Li Date: Sat Apr 22 14:26:06 2017 -0400 Fixes SyncElimination blow up with function calls. commit 44e8409f071578546b572b6dd807a83092867bfa Author: Bojan Serafimov Date: Mon Apr 10 12:06:51 2017 -0400 Fix tests commit adeb3eaaf5af3d9c816db1a704324c9f715a0277 Author: Jiahao Li Date: Mon Apr 10 11:46:36 2017 -0400 Handles instructions with null call sites. commit 96f24b65e5a4634c8a78ac0e53dd552fe46d185d Author: Bojan Serafimov Date: Mon Apr 10 10:19:42 2017 -0400 Ignore sync instruction in rosetta commit d874567d6e6cdfc88c0faab3122975046162ec09 Author: Bojan Serafimov Date: Tue Apr 4 19:14:29 2017 -0400 Add nested loop test commit 8f7734960776d31ddcb0cf690da837c3f7ee9229 Author: Bojan Serafimov Date: Fri Mar 17 17:39:58 2017 -0400 Fix bug in FindRosetta commit e0bac90f990423a17e245cd6cb2d9f9f2b387951 Author: Bojan Serafimov Date: Fri Mar 17 17:03:16 2017 -0400 Add test cases commit 7ccc4c9454b80ef03f14a0c03d86fceea2309581 Author: Jiahao Li Date: Fri Mar 17 16:57:54 2017 -0400 Fixes sync elimination test. commit b5f16cfaf2ce8c9311104f356522c527cfe0b8ba Author: Jiahao Li Date: Fri Mar 17 16:51:37 2017 -0400 Removes incomplete sync elimination test. commit 344d075d08c6d23be99373b1b65a94fb6f92701d Author: Jiahao Li Date: Fri Mar 17 16:47:29 2017 -0400 Removes function renaming in sync elimination. commit 4045b1f2bd1d4e1ff6527bdc4349d9938e188463 Author: Jiahao Li Date: Fri Mar 17 16:15:20 2017 -0400 Fixes loop condition error in sync elimination. commit 7eab317e1436d2fc456f0f625ef4888577c53bec Author: Bojan Serafimov Date: Fri Mar 17 16:33:40 2017 -0400 Fix tests commit 2c6412e1a4bb92a5fc86f63803a52ea22c43aa05 Author: Jiahao Li Date: Fri Mar 17 14:54:13 2017 -0400 Implements legality check for sync elimination. 
commit a57ac4cafdfe845f0c90cc0611705c38f87f1905 Author: Bojan Serafimov Date: Fri Mar 17 16:05:14 2017 -0400 Add basic SyncElimination tests commit a7c6bdec1a3562a9333e06497e362ab5e8e45613 Author: Bojan Serafimov Date: Mon Mar 13 11:09:06 2017 -0400 Implement sync removing commit 271c65cf91c5a2223ebac864cb55d6137d6d00c4 Author: Jiahao Li Date: Thu Mar 9 16:59:16 2017 -0500 Implements Vegas-set finding for SyncElimination pass. commit 72827d0cc4ef8b3fb556bdb4660c6b0891849b4f Author: Jiahao Li Date: Thu Mar 9 15:58:45 2017 -0500 Implements Rosetta-finding part of SyncElimination pass. commit df4c672499f76bcbfdf93806755e6f9ff15035f6 Author: Jiahao Li Date: Thu Mar 9 15:08:28 2017 -0500 Cosmetic cleanup. commit 2682b3bf34c4efd7fc86e0af26d3a0b1dffc108f Author: Bojan Serafimov Date: Wed Mar 8 00:52:22 2017 -0500 Add SyncElimination pass commit 3856a31e3af623255498bc878b750e82c90a34b7 Author: Jiahao Li Date: Sat Apr 22 16:27:38 2017 -0400 Enables LoopFuse by default. commit 6017d8b2a125a66cb418d247281433a5665ab249 Author: Jiahao Li Date: Sat Apr 22 16:27:26 2017 -0400 Rebases LoopFuse to compile on the current code base. commit 367d9d916cbaf9d2433d267bf9c70be772fe8af7 Author: Jiahao Li Date: Sat Apr 22 16:04:20 2017 -0400 Replaces LoopAccessAnalysis with LoopAccessLegacyAnalysis in LoopFuse. commit bb0b29851651bc1d122b7aed839a58edb4e656ce Author: Jiahao Li Date: Sat Apr 22 15:40:47 2017 -0400 Applies https://reviews.llvm.org/D17386 for Loop Fusion Pass. commit 3ce522e822ad2a0b047c0cc905cf59b8f4247d26 Author: Douglas Kogut Date: Sat Apr 22 14:11:36 2017 -0400 pushing spawn work commit 0dd0df9b42bac64d82ffe5035f6d4f5d7b2dd2b0 Author: TB Schardl Date: Thu Mar 30 12:40:37 2017 +0000 [PassManager] Re-enabling passes that happen after optimizations when Cilk is not enabled. 
commit 511ba02c8ccb2bf15a0791007229389352bffef9 Author: TB Schardl Date: Thu Mar 16 14:25:49 2017 +0000 [Tapir] When outlining, propagate available alignment information to the parameters of the outlined function. commit 4722cecdb2cef0b0ab84c08f65ae296bb4c01a2f Merge: 285ff461789 780934e4b6a Author: TB Schardl Date: Fri Mar 10 20:18:23 2017 +0000 Merge branch 'master' of github.com:wsmoses/Parallel-IR commit 285ff4617892da4132f4a0aded992dcc4c5af6d5 Author: TB Schardl Date: Fri Mar 10 20:17:05 2017 +0000 [Tapir] Fix to properly maintain allocas in the entry block of a detached context. These changes ensure that every detached context has an entry block with just one predecessor. These changes also move allocas among entry blocks during function inlining and the outlining process for lowering Tapir. These changes also remove syncs associated with parallel loops after outlining. commit 489f0a4673d2b0364556382569e421fed347d301 Author: TB Schardl Date: Fri Mar 10 20:14:03 2017 +0000 [Local] Bug fix to make the GetDetachedCtx routine properly return the detached BB at the start of a detached context. commit cd7e9f3c2d840182ab82830218703b78c657d1b0 Author: TB Schardl Date: Fri Mar 10 20:11:56 2017 +0000 [SimplifyCFGPass] Code cleanup and comments. commit 35669cce54f33447d1f12423e71536ab31cf02e5 Merge: 1fae2a923fb 52889bc3118 Author: William S. Moses Date: Wed Mar 8 11:33:46 2017 -0500 Merge branch '6898' of github.com:wsmoses/Parallel-IR into 6898 commit 780934e4b6a8054900b774d9405c0dd426bd23be Author: William S. Moses Date: Tue Mar 7 18:08:44 2017 -0500 Parallelize / Shorten compilation commit 4cc8071621e2c159a755a594bdb5dde9fbdfe74d Author: William S. Moses Date: Tue Mar 7 17:37:28 2017 -0500 Fix optimized llvm build commit 26007676a05e6c0445a0971f5bbfb0a2b2e9c47b Author: William S. Moses Date: Tue Mar 7 17:31:40 2017 -0500 Updated binary commit 6917c16e028fb03a608ba2e2f33ce48c68900b92 Author: William S. 
Moses Date: Tue Mar 7 17:21:27 2017 -0500 Faster cmake and autobuild matrix commit 088941d05808f63865028347f4fcd3cbc849ce08 Author: William S. Moses Date: Tue Mar 7 16:56:44 2017 -0500 Remove old cmake commit c558e05a3917b7be37490cd45b6c2d9fc153adbc Author: William S. Moses Date: Tue Mar 7 16:55:17 2017 -0500 Print directories for debugging script commit 074121e15927e674b16e2656913ecd08d557a422 Author: William S. Moses Date: Tue Mar 7 16:45:52 2017 -0500 Leave directory in autobuild after cmake commit 30a221e0a04ae4dae0575a092800799e7aa7792f Author: William S. Moses Date: Tue Mar 7 16:38:07 2017 -0500 Build without parallel option commit 7a7d719c26e78e049093f1869eb6573e7cb3e529 Author: William S. Moses Date: Tue Mar 7 16:32:07 2017 -0500 Build newer cmake from source commit 24f129bf4857357c90f8458c2ce09b60ab112b36 Author: William S. Moses Date: Tue Mar 7 16:24:00 2017 -0500 Correct ppa commit e2bc0fc2d7edc08fb427b6f0a30862c602e57dfb Author: William S. Moses Date: Tue Mar 7 16:21:28 2017 -0500 Change CMake to sourceline commit c6249f0bce0d9906f5d669c6d44d15f5977e09d3 Author: William S. Moses Date: Tue Mar 7 16:16:37 2017 -0500 Attempt newer CMake commit fe47a0078d432ee911504fa05c1af0652122dce7 Author: William S. Moses Date: Tue Mar 7 16:08:27 2017 -0500 Build PClang along with Tapir commit 8ee564cae3bbb672546427bab5137b90ce2fdc17 Author: William S. Moses Date: Tue Mar 7 16:07:36 2017 -0500 Build intel runtime using the Tapir compiler commit 6750684c7007e0e6ea0300498e7196cf68c52176 Author: William S. Moses Date: Tue Mar 7 16:00:50 2017 -0500 Add configure to cilk runtime building commit 3f3b46840218f1629f1183b1ef0772414ca145c2 Author: William S. Moses Date: Tue Mar 7 15:57:18 2017 -0500 Add make to dependency list commit bd6f8df75f130bcf260fc4a3102d73341d21dc1b Author: William S. Moses Date: Tue Mar 7 15:54:50 2017 -0500 Add cilk runtime building commit 6372499258146bf9da15f0153c9e4f4d288578cc Author: William S. 
Moses Date: Tue Mar 7 15:42:22 2017 -0500 Change autobuild cmake version commit 9fec173620bf1c3c964292485f007a69fc05ca72 Author: William S. Moses Date: Tue Mar 7 15:39:43 2017 -0500 Change autobuild distribution commit 1fae2a923fb632a6eb1dabc4826e3b2533735273 Author: William S. Moses Date: Tue Mar 7 15:35:20 2017 -0500 Relist as package commit 52889bc31182f3faebcfce24918670967b5b96f6 Author: Douglas Kogut Date: Mon Mar 6 12:11:10 2017 -0500 pushing example opt pass commit fe692e250aa8a78435200882ebb89c17f881c4d3 Author: TB Schardl Date: Fri Mar 3 13:25:57 2017 +0000 Ignoring debug build directory. commit 69fa592b7e889be513f1004b1f13dd450a1be378 Merge: 3c56ed06c17 df445de9e82 Author: TB Schardl Date: Fri Mar 3 13:20:52 2017 +0000 Merge branch 'master' of github.com:wsmoses/Parallel-IR commit 3c56ed06c17f764e2c1221df60e8ee45199b1577 Merge: 4611d796dea 2d562fe758b Author: TB Schardl Date: Fri Mar 3 13:19:05 2017 +0000 Merge branch 'master' of github.com:llvm-mirror/llvm commit df445de9e8252e5aff8a6d7645128df71b3bd45f Author: William S. Moses Date: Thu Mar 2 00:37:50 2017 -0500 Correct CI build script commit efa60d2d710c5697f6be5737898897cfb56b4509 Author: William S. Moses Date: Wed Mar 1 16:07:01 2017 -0500 Force travis-ci to rebuild commit 66ed989e47c276699462c761b0e4f2b68ef5d951 Author: William S. Moses Date: Tue Feb 28 16:18:35 2017 -0500 Initial attempt at adding Travis autobuilder commit b8a1f3fb7874d52fedb6db8a786695521a846709 Merge: 518873a5b44 a3bd7557fb6 Author: William Moses Date: Tue Feb 28 11:49:18 2017 -0500 Merge pull request #12 from YingVictor/master [LowerToCilk] Fix memory leak. commit a3bd7557fb661ef0980599d430e7cd0a52f7f385 Author: Victor A. Ying Date: Tue Feb 28 11:41:08 2017 -0500 [LowerToCilk] Fix memory leak. SmallVector of NewHelpers needs to be deleted. 
commit 518873a5b44c8ffc37282cb3887a1518525eca7f Merge: 645daf3405c fb71c4aa6b4 Author: William Moses Date: Sun Feb 26 17:29:34 2017 -0500 Merge pull request #11 from YingVictor/master Two minor fixes commit fb71c4aa6b408ce59e095b3d770ba01ab4eb9f51 Author: Victor A. Ying Date: Sun Feb 26 16:53:55 2017 -0500 [include/llvm-c/Transforms/Tapir.h] Fix function name mentioned in comment. commit 2e658275b9935e536f86aec6b7f911b6c5e374cc Author: Victor A. Ying Date: Sun Feb 26 16:46:18 2017 -0500 Properly remove traces of clang submodule. Removing a git submodule requires more than just deleting the entry in the .gitmodules file, as was done in the previous commit. It also requires deleting the special directory entry from the git index, which should be done using some variation of "git rm", such as: git rm --cached path/to/submodule Which is what I did in this commit. commit 645daf3405c01f6e262373a6c849466f09162f44 Author: William S. Moses Date: Fri Feb 24 15:35:50 2017 -0500 Remove clang submodule commit c9830e69c572885f6bfc7a74179a8e7efb6c851e Merge: 3ad6c9cb76e 4611d796dea Author: William S. Moses Date: Fri Feb 24 15:33:45 2017 -0500 Merge branch 'master' of github.com:wsmoses/Parallel-IR commit 3ad6c9cb76eba2c5fbf7a5c8416ac28793d6455e Author: William S. Moses Date: Fri Feb 24 14:10:50 2017 -0500 Update clang to stable commit 4611d796dea964dea884c34cadcef14b256fbe56 Author: TB Schardl Date: Tue Feb 21 19:46:22 2017 +0000 [CodeExtractor] Removed unused function from CodeExtractor. commit 73b2a05f9106a888ae92fbd9d89fd36be310bcce Author: TB Schardl Date: Sun Jan 15 14:19:32 2017 +0000 [LoopSpawning] Restored warnings when LoopSpawning fails to transform a marked loop. commit 710c06b2ffad2727ff751113b90b9905f4a3c845 Author: TB Schardl Date: Sun Jan 15 14:18:54 2017 +0000 [CodeExtractor] Removing old code for dealing with debug symbols. 
commit ab75cf00f520c07d4dafa58328fa809780ac146b Author: TB Schardl Date: Fri Jan 13 22:25:29 2017 +0000 [LowerToCilk] Renaming Detach2Cilk to LowerToCilk, as part of some code cleanup. commit 2748779e158be086e9fa52300ccd5fcded978044 Author: TB Schardl Date: Wed Jan 11 13:59:02 2017 +0000 Updated associated version of Clang. commit 738a76c83c83017faaeeaf959fb0c45b4586b08f Author: TB Schardl Date: Wed Jan 11 13:31:23 2017 +0000 [test] Adding some simple regression tests for Tapir. commit 5b63394d73f1d65ec6e338ed9ba8063895d8ef4e Author: TB Schardl Date: Mon Jan 9 19:11:44 2017 +0000 [Tapir/Outline] Fix debug build. commit df3dcb657228c40bff3ee7cab30944ed9e116021 Author: TB Schardl Date: Mon Jan 9 02:31:01 2017 +0000 [Tapir/Outline] Minor code cleanup. commit facf7c87283b30b139fe75fbd4caacfc32c0fb37 Author: TB Schardl Date: Mon Jan 9 02:29:07 2017 +0000 [Detach2Cilk] Inline __cilk functions into generated helper functions. commit c32adbf10f18c9a52e10de2e046329f67f635699 Author: TB Schardl Date: Sun Jan 8 22:48:22 2017 +0000 [LoopSpawning] Code cleanup for release build. commit 3b460341f6a21344ddbc11100cd75ef079bcd8ee Author: TB Schardl Date: Sun Jan 8 22:41:02 2017 +0000 [Detach2Cilk] Fixed creation of Cilk stack frames for release build. commit 4bcdb952154d0daf4f18384cceda7f72e7b2542d Author: TB Schardl Date: Sun Jan 8 20:42:48 2017 +0000 [SROA] Minor code cleanup. commit 3c73fb9bf4d241c96c31f10c3a89074ffbf30774 Merge: 0d6f0aad70a 18687546b92 Author: TB Schardl Date: Tue Jan 3 19:24:51 2017 +0000 Merge branch 'new_lowering' commit 18687546b9276fcb76c619193ee46b93f05a7001 Author: TB Schardl Date: Tue Jan 3 17:18:12 2017 +0000 [Detach2Cilk] Code cleanup. commit 2a7c78c09452762cc784ac4cf92381340830a90c Author: TB Schardl Date: Tue Jan 3 16:59:48 2017 +0000 [LoopSpawning] Added support for Tapir loops with exit blocks terminated by unreachable. 
commit a1af329428f71f12decbe8776e2d9b4d9b377c63 Author: TB Schardl Date: Sat Dec 31 17:06:01 2016 +0000 [CSI] Fix formatting of CSI pass. commit 08b3602ddb14e7bbe7fe78faa7a12c4fbd43e431 Author: TB Schardl Date: Sat Dec 31 17:05:07 2016 +0000 [CSI] Add function names to FED tables. commit 1672db6417856784850c9aaa5f879c1bb5f6f539 Merge: a22c19d21b9 56516028d8b Author: TB Schardl Date: Sat Dec 31 14:59:27 2016 +0000 Merge branch 'master' of github.com:llvm-mirror/llvm into new_lowering commit a22c19d21b991cd92e7f64103166f66f0f89eabd Merge: 04b71642665 7f580b605b2 Author: TB Schardl Date: Tue Dec 20 14:25:09 2016 +0000 Merge branch 'master' of github.com:llvm-mirror/llvm into new_lowering commit 04b716426657e5cf52c69e6e6953492e1e3b7434 Author: TB Schardl Date: Tue Dec 20 14:09:15 2016 +0000 [LoopSpawning] Switching LoopSpawning back to implementing divide-and-conquer scheduling directly. commit c03b7f076ab44c6e37edb033cf1b16950740fca7 Merge: 0cc6919dafd eaf3712d06e Author: TB Schardl Date: Mon Dec 19 21:47:05 2016 +0000 Merge branch 'master' of github.com:llvm-mirror/llvm into new_lowering commit 0cc6919dafdf326efdfa275f66556ad1a9abfe67 Author: TB Schardl Date: Mon Dec 19 20:34:25 2016 +0000 [Outline] Cleaning up the code. commit 747d1e8211d2c6ce8eeee40a79d3f684e9747e1c Author: TB Schardl Date: Mon Dec 19 20:30:37 2016 +0000 [LICENSE] Updated license to add copyright for changes to implement Tapir. commit 0d6f0aad70ae0b75a4f71567bd098703070c3c56 Author: William S. Moses Date: Sat Dec 17 23:15:13 2016 -0500 add clang submodule commit 463af403bf33e14b759a60377c95ffe3d1f74382 Author: TB Schardl Date: Tue Dec 13 02:28:54 2016 +0000 [LoopSpawning] Keeping two versions of divide-and-conquer loop spawning around. commit fcae33a06441a48081c463f74d12fc5f6b9ce68a Author: TB Schardl Date: Tue Dec 13 02:21:17 2016 +0000 [PassManagerBuilder] Modification to support more faithful reference pipeline for PPoPP. 
commit 6a8c5d26ad24a6f35ca8afcc17f18ea89f790f09 Author: TB Schardl Date: Sun Dec 11 22:29:25 2016 +0000 [LoopSpawning] Fixed bug in computing loop count for using Cilk ABI call. commit b8af887cac2f664ae780631cd14ea2a194ea042c Author: Ubuntu Date: Sun Dec 11 08:19:56 2016 +0000 cilk abi loopspawning commit 217f4eafa2694468cb3817fb65e05b95ddd1d0b3 Author: TB Schardl Date: Sat Dec 10 20:39:12 2016 +0000 [CilkABI] Bug fix to allow proper lowering when a loop is the entry of a detached CFG. commit 82cb28db1a9877d923da8a038c8f33a9079b6121 Merge: 8a4ac0d5d6e 05bdd2ebfe8 Author: TB Schardl Date: Mon Nov 28 21:20:47 2016 +0000 Merge branch 'master' of github.com:llvm-mirror/llvm into new_lowering commit 8a4ac0d5d6ee455a6000fd60cd37018642a2b5ba Author: TB Schardl Date: Mon Nov 28 15:58:29 2016 +0000 [LoopSpawning] Refactored to be a FunctionPass, instead of a LoopPass. More work is needed for this pass to legally add functions to the current Module. commit 7f96f2c38f8233502a50c6bfd66257be0915ea41 Author: TB Schardl Date: Mon Nov 28 15:55:11 2016 +0000 [LoopSimplify] Modified to ensure that the preheader of a loop is not terminated by a sync. commit f84012859a7fd293377b87a2c0d95d2cbd75aee0 Author: TB Schardl Date: Mon Nov 28 15:53:05 2016 +0000 [Tapir/Outline] Cleaning up commented-out code. commit 2e932359c6f63a76e6a040bdf577ca9f162ddd8f Author: TB Schardl Date: Mon Nov 28 15:52:22 2016 +0000 [BasicBlockUtils] Modified SplitEdge to keep sync instruction in original block. commit 32aeb36a6f76b69247231a1b57a9b66a32627ed1 Author: TB Schardl Date: Mon Nov 28 15:50:19 2016 +0000 [Detach2Cilk] Making Detach2Cilk a ModulePass, instead of a FunctionPass, so it can safely add functions to the module. 
commit 6ab23d5f49ab42f2d3074523570cf72cd7ee6d02 Merge: 56598980fc5 52894d83e1a Author: TB Schardl Date: Sat Nov 26 17:23:45 2016 +0000 Merge branch 'master' of github.com:llvm-mirror/llvm into new_lowering commit e189e6c97da75849d75b512dd5513c0ec5a09af4 Merge: 6952888faaa c3bdfe57eb1 Author: Ubuntu Date: Thu Nov 24 17:07:50 2016 +0000 Bring up to date with most recent llvm commit 56598980fc58d0bd68e2957eb45371eb23245995 Merge: 6a33185a05c 3e65807a6f1 Author: TB Schardl Date: Wed Nov 23 18:31:46 2016 +0000 Merge branch 'master' of github.com:llvm-mirror/llvm into new_lowering commit 6952888faaaf797beb00934eee0c99f85fbfeea5 Merge: e79c0d93864 e372554cd73 Author: TB Schardl Date: Fri Nov 11 21:42:16 2016 +0000 Merge branch 'master' of github.com:wsmoses/Parallel-IR commit e79c0d93864a579bf6b865802e182a7b80d9ea48 Author: TB Schardl Date: Fri Nov 11 21:34:37 2016 +0000 [PassManager] Ensure that extensions to the pass manager that are intended to run last only run once on Tapir programs. commit 6a33185a05c72739458a92e13a103ed4b3ae4b97 Author: TB Schardl Date: Fri Nov 11 21:34:37 2016 +0000 [PassManager] Ensure that extensions to the pass manager that are intended to run last only run once on Tapir programs. commit 6f2c14afe41e2bb9729976b52734d98f3c99bae3 Author: TB Schardl Date: Fri Nov 11 21:18:30 2016 +0000 [LoopSpawning] Ensure that calculation of a Tapir loop limit is inserted at the end of the loop's preheader. commit e372554cd7396b1facc00f6d5df7d51f89553e31 Author: William S. Moses Date: Thu Nov 3 23:57:38 2016 -0400 Remove some debug prints commit 6baad834b9903206be5830e9a5d81cb8c118dc80 Author: William S. Moses Date: Thu Nov 3 23:54:44 2016 -0400 Remove some debug prints commit 782593d7bcd41736b148b6b128890d31f0d49f10 Author: TB Schardl Date: Tue Nov 1 14:40:47 2016 +0000 [LoopSpawning] Cleaning up code and debug output. 
commit f604273ecf927017dc48afdae928477f8708e0d5 Author: TB Schardl Date: Tue Nov 1 14:39:42 2016 +0000 [Detach2Cilk] Should not need to inline detached helper functions anymore, because Detach2Cilk should properly handle debug symbols. commit 20d299f2d2839b1f45b6716970f5a99ee821cec3 Author: TB Schardl Date: Tue Nov 1 14:37:40 2016 +0000 [PassManagerBuilder] Run SimplifyCFG after Detach2Cilk to clean up cruft left by Detach2Cilk. commit 1610d83dd9f26a9f47004634f83b7e5a614f46f6 Author: TB Schardl Date: Tue Nov 1 14:36:49 2016 +0000 [Detach2Cilk] Fix to ensure that Phi nodes in the continuation of a detach are still valid after lowering the detach to Cilk runtime calls. commit ea14d8bd01adccba902cdae883625698319b7d61 Author: TB Schardl Date: Tue Nov 1 04:42:24 2016 +0000 [CilkABI] Converting Detach2Cilk pass to use new Tapir outlining methods, in order to handle debug symbols more correctly. commit 1f30c735f929c5821cf575aeea59ee1b6eef3164 Author: TB Schardl Date: Mon Oct 31 21:56:25 2016 +0000 [LoopSpawning] Fixed bugs to properly erase loops after performing transformation and to handle preheaders terminated by syncs. commit a86651dd973a6f0743b4a360396dba6360fc5bdf Author: TB Schardl Date: Mon Oct 31 21:54:45 2016 +0000 [Outline] Cleaning up CreateHelper Tapir outlining method. commit 31691cd15ae0f76c40420339849f652888294863 Author: TB Schardl Date: Mon Oct 31 15:38:08 2016 +0000 [LoopSpawning] Cleaning up LoopSpawning code, and adding output to loop-spawning reports. commit 51220e44f007bb6b5be02ecbbf2e20840634daba Author: TB Schardl Date: Mon Oct 31 15:34:55 2016 +0000 [Tapir] Renaming TapirOutline to Outline. commit 6950ba60b07973d535c06f288e0ed30b14d43aa9 Author: TB Schardl Date: Sun Oct 30 19:19:15 2016 +0000 [TargetLoweringBase] Dealing with compile warning on TargetLoweringBase. commit 581677b179aa2ed89134c8034ac491fae68595f0 Author: TB Schardl Date: Sun Oct 30 19:18:10 2016 +0000 [LoopSpawning] Replacing Loop2Cilk with LoopSpawning. 
commit 39d404b1998c4c2d3635939c27f85c70e987d70f Author: TB Schardl Date: Sun Oct 30 18:54:23 2016 +0000 [DiagnosticInfo] New method for emitting warning messages for the LoopSpawning pass. commit 3d834b9e67f2779d2acd2bfd65d0b192561597d1 Author: TB Schardl Date: Thu Oct 27 21:27:33 2016 +0000 Updating passes to run around new Loop2Cilk implementation. commit 35ec023f57f3a240f598d2a9822ec29aedcaf48c Author: TB Schardl Date: Thu Oct 27 21:25:43 2016 +0000 Moving Tapir-specific transformations to a separate subdirectory under Transforms. commit 3aae9e2c7b3402a3816f5b31a70a9326674c7a9f Author: TB Schardl Date: Sat Oct 22 14:40:05 2016 +0000 [Cilk] Refactoring components for lowering Tapir to Cilk runtime calls. commit 0a92f963f5978e3f7cd91a1f77a9b3040b4a2baf Merge: 54f16a4669d fe05c97a9eb Author: TB Schardl Date: Sat Oct 22 14:33:05 2016 +0000 Merge branch 'master' of github.com:wsmoses/Parallel-IR commit 54f16a4669deaefc6a92a6f098485ee2d02d608b Author: TB Schardl Date: Sat Oct 22 14:30:27 2016 +0000 [Local] Cleaned up formatting to get rid of tabs. commit a8fade288fdbc1e194b7b0adba5ebdf61f05cb38 Author: TB Schardl Date: Sat Oct 22 14:28:18 2016 +0000 [Local] Fix to SerializeDetachedCFG to preserve debug symbols. commit 5cc10ed3110941799eb681ad00833028ca692193 Author: TB Schardl Date: Sat Oct 22 14:17:40 2016 +0000 [Instrumentation] Adding CSI instrumentation pass, copied from https://github.com/CSI-LLVM/. commit fe05c97a9eb98c01cfaa7a1a5129b0d002e2db70 Author: William S. Moses Date: Sat Oct 22 10:00:23 2016 -0400 Resolve issue 7 commit 4664388bb8c70312e21d321196942924a23955ff Author: TB Schardl Date: Wed Oct 19 16:01:28 2016 +0000 [emacs] Added detach, reattach, and sync as control instructions in LLVM's emacs mode. commit c0e8f4fe8db4bdac7f84bbf2ce6cb8a73a9252bd Author: TB Schardl Date: Mon Oct 17 04:14:35 2016 +0000 [SSAUpdater] Derive the correct value from detached predecessors. 
commit 2abd121b4c25579045347105a56b8383d0cefb9d Author: TB Schardl Date: Fri Oct 14 21:46:24 2016 +0000 [LICM] Fixing compiler crash when LICM attempts to move a store outside of a Tapir loop. commit 28606d0fb2e4e2bcaf37959292c2a89cedaf7a1e Author: TB Schardl Date: Thu Oct 13 02:12:43 2016 +0000 [AliasAnalysis] Minor formatting change. commit e5e04d08d7ddad2e021d0744ef52c52048955a2c Author: TB Schardl Date: Thu Oct 13 02:08:30 2016 +0000 [InlineFunction] Preventing InlineFunction from moving alloca's out of their detached context after inlining. commit 14719bb0513004960e3c8b0571b82981cc2b1239 Merge: 84848c51548 7f4bee18532 Author: William S. Moses Date: Thu Oct 6 13:53:55 2016 -0400 Merge branch 'master' of github.com:wsmoses/Parallel-IR commit 84848c51548b59b6beafa5c90615f36e64500199 Author: William S. Moses Date: Thu Oct 6 13:53:50 2016 -0400 Allow full unrolling of cilk for loops commit 7f4bee185325eebc78533ef450a45e43926da694 Author: TB Schardl Date: Thu Oct 6 16:51:37 2016 +0000 [AliasAnalysis] Force AliasAnalysis to fail fast if it finds a detached CFG that reaches its own Detach instruction. commit a2c6e22dd11c4212dbb64ce15020f677d77ed479 Author: TB Schardl Date: Tue Oct 4 22:44:38 2016 +0000 [Loop2Cilk] Fix splitting of loop preheaders that are terminated by sync instructions. commit 1d1bdcf375abd2e0e83a8500278acc6124bf16f2 Author: William S. Moses Date: Sun Oct 2 23:19:30 2016 -0400 minor modref fix commit 9ca914a946ee787fa8750a0a622d0f901641f2cf Author: William S. Moses Date: Fri Sep 23 16:12:32 2016 -0400 fix line info commit 16395e5ae2ab1cbc17de82c0127680aeccecedc1 Author: William S. Moses Date: Thu Sep 22 09:08:42 2016 -0400 Additional clean up commit af36e03c8282f4c431260dbfe16e3c323c72b82d Author: William S. Moses Date: Wed Sep 21 16:56:01 2016 -0400 clean up unrollinng commit 87d19e853f283cf9fac9c1e71239e34227fad27c Author: William S. 
Moses Date: Wed Sep 21 16:48:27 2016 -0400 resolve move to clang 4 commit 79323f66683946df1702005e3071f7fed23f0c3d Author: William S. Moses Date: Thu Sep 15 15:06:36 2016 -0400 fix tre commit 574835b96b09f8d9b496f17c303b7a3457cd2e1f Author: William S. Moses Date: Thu Sep 15 12:01:49 2016 -0400 Fix mem2reg bug commit 88cccc72240abd17a1dec0b2d238686919db7e81 Author: William S. Moses Date: Tue Sep 13 17:14:44 2016 -0400 fix running bugs commit f449ac224baed049d3a4eecaccaeef7ac0954e36 Author: William S. Moses Date: Mon Sep 12 14:10:31 2016 -0400 fmt commit 1d618f6fc664f473131fa11d3b5ba495e3d1cbbd Author: William S. Moses Date: Mon Sep 12 14:08:22 2016 -0400 fmt commit 05d2fe180fe4980474f8e7317936b312b749e048 Author: William S. Moses Date: Mon Sep 12 14:07:24 2016 -0400 fmt commit cb166968bc4f79b54e24272b59f935e3239109c6 Author: William S. Moses Date: Wed Aug 17 22:11:31 2016 -0400 solid commit 1be62909730984141b5afbec84c48823735c4429 Merge: c3eb1b7594a e65e275cf2f Author: William S. Moses Date: Wed Aug 17 18:01:27 2016 -0400 Merge remote-tracking branch 'llvm/master' commit c3eb1b7594a5953a324015aa08f745e31fb0ec65 Author: William S. Moses Date: Wed Aug 17 18:00:22 2016 -0400 cleanup commit 925a26d33e5aa664ed2a950bfac6f123832d28f1 Author: William S. Moses Date: Wed Aug 17 17:55:49 2016 -0400 cleanup commit 8a4aa28bc1ac48d2073507eb365e2461b206f524 Merge: 9ee354913cb 7177ff558c7 Author: William S. Moses Date: Wed Aug 17 02:54:17 2016 -0400 merge to mainline commit 9ee354913cb1d00c79b0173d87e8259db193d73f Author: William S. Moses Date: Mon Aug 15 01:43:52 2016 -0400 Add race detector commit 9b7715ebfc3bdd80382cbce7ca724868789c9cd6 Author: William S. Moses Date: Wed Aug 10 00:04:31 2016 -0400 cmake fixes commit b66e56629e6ddd6895342d281ed510b011cecff1 Author: Ubuntu Date: Fri Jul 29 21:11:20 2016 +0000 LICM fix commit c1aabfb01f044642dc9fb4317313d408c3cc39fc Author: William S. 
Moses Date: Wed Jul 27 21:22:20 2016 -0400 add merge functions commit 72b025f6f0d254ab7e37e7cabb42e9e27f01ede8 Author: William S. Moses Date: Wed Jul 20 13:40:34 2016 -0400 fix dt commit 39c33184af36efb1af71591940caf1924ace5ac8 Author: William S. Moses Date: Wed Jul 20 13:34:33 2016 -0400 fix dt commit af099d0ad6a6c263f969e2c8b577d8a6c80bd685 Author: William S. Moses Date: Wed Jul 20 13:14:30 2016 -0400 fix dt commit 920d83fc1bed8c82c0f2ccf58379371445206469 Author: William S. Moses Date: Wed Jul 20 12:12:44 2016 -0400 fix ph issue commit b0abbc37c6e836acf46b8703b54a0881fd499b96 Author: William S. Moses Date: Wed Jul 20 11:49:12 2016 -0400 resolve print commit d7aa05a4ebf5866d9fe70dd3733e9e20df4fdd76 Author: William S. Moses Date: Tue Jul 19 18:10:57 2016 -0400 major pbbs bugfix commit f470066edb8b7a8d8db7cef0b9a7b65f8fd8090a Author: William S. Moses Date: Tue Jul 19 14:31:06 2016 -0400 fix ppbs bug commit e1ac630d820ec2a7455392f4ddc9c4c620ea26c2 Author: William S. Moses Date: Mon Jul 18 21:35:07 2016 -0400 mod graint position commit 0e725b855f90f63703d71a8761f717697912b65c Author: William S. Moses Date: Mon Jul 18 21:14:16 2016 -0400 mod graint position commit 83e0982370d9a89d4f0b0b33636511568d8eda40 Author: William S. Moses Date: Mon Jul 18 16:17:40 2016 -0400 cilk abi fixes commit 63738d884d78c5297d1c781da81b6599e9cdeba3 Author: William S. Moses Date: Mon Jul 18 13:07:38 2016 -0400 fix recursive idx commit 45ca520784a38bbc13b0d00597310d931c757e4b Author: William S. Moses Date: Mon Jul 18 02:25:34 2016 -0400 fix issues with d2c extraction commit 0e9c93c9d38a035d1ea88c2fbfbff6d6144cde0f Author: William S. Moses Date: Sun Jul 17 22:21:06 2016 -0400 add reopt commit ec8c23de30635cb0969514bd18068d4e2bd77ec9 Author: William S. Moses Date: Sun Jul 17 22:18:39 2016 -0400 prevent rerunning passes commit 8d6bd63be4a6c8ebf61be02b9d2d8535de3b9484 Author: William S. 
Moses Date: Thu Jul 14 13:19:44 2016 -0700 fix asm errors commit f83bdc1fab9bf732ea0be8b134cea617e4f85500 Author: William S. Moses Date: Tue Jul 12 08:18:01 2016 -0700 fix unreachable merge domtree bug commit 662b5a7e0018b659b08dc9256dfd61f94d756f56 Author: William S. Moses Date: Mon Jul 11 16:04:43 2016 -0400 Resolve issues with bounds detection in loop2cilk commit 4866c5da1c28d2c67dc168edf119cc4adfbc07f3 Author: William S. Moses Date: Thu Jul 7 09:28:14 2016 -0400 minor attr fix commit 1f4c43c41f109f82859a88525a851f00b2e1b5e4 Author: William S. Moses Date: Thu Jun 30 15:05:11 2016 -0400 fix bounds error commit 0caf3f63eb873abb93e06080eb875f0945c5c2df Author: William S. Moses Date: Thu Jun 30 14:13:54 2016 -0400 speedup fix commit 5cf555f901601c76bc416f7ef94dc77b375bcf84 Author: William S. Moses Date: Thu Jun 30 12:41:46 2016 -0400 resolve linker issues commit 25e91bfc5f42f6eb1977cefe90336e85994d65d3 Author: William S. Moses Date: Thu Jun 30 12:37:47 2016 -0400 prevent l2c recursive loops commit 325bce7bb19e0e4828e6f7eba6ba6420a1f59f7a Author: William S. Moses Date: Wed Jun 29 22:41:14 2016 -0400 fix issue with loop parents commit 8e0997cb4b85e14c83783d81a7e3815d64fc6056 Author: William S. Moses Date: Wed Jun 29 21:10:51 2016 -0400 more efficient loops commit f302f9480f94a4e7f816707e5224c85e0bf07218 Author: William S. Moses Date: Wed Jun 29 01:05:05 2016 -0400 l2c computes grain size commit 1dbd257083c5d5e95fa662cc99da0b150aed94e2 Author: William S. Moses Date: Tue Jun 28 16:47:52 2016 -0400 more error info for bad return state commit ec4340b4cee3951abf49ad1636bff07cb77fb80f Author: William S. Moses Date: Mon Jun 27 17:57:49 2016 -0400 fix accidental breakage commit 88ceb1203926d59578e2c0dba02bf3b38f374120 Author: William S. Moses Date: Mon Jun 27 14:39:50 2016 -0400 fix loop2cilk indvar incr adding issue commit 0a1cbbf7dff910f348713a88108169e03dabf3de Author: William S. 
Moses Date: Fri Jun 24 13:43:53 2016 -0400 Better Parallel TRE commit bc96f0b3f141176d1667b1700be945aed7520e9c Author: William S. Moses Date: Fri Jun 24 01:38:46 2016 -0400 Parallel TRE commit 579d39d8efab448cacf9c41aea8197226c64bfe4 Author: William S. Moses Date: Thu Jun 23 13:47:13 2016 -0400 more secure sync detect for loop2cilk commit c06f49770a26c971efe66356b90a0a1ef7f2a301 Author: William S. Moses Date: Wed Jun 22 16:57:07 2016 -0400 Fix alloca issues for detached code commit 150056edc4a2bb03c0bbe94923cfa189ce44f052 Author: William S. Moses Date: Tue Jun 21 19:17:47 2016 -0400 minor opt diff commit 497c3b498bc8ce71ad913dff063853204810f402 Author: William S. Moses Date: Tue Jun 21 15:02:58 2016 -0400 modify pass commit 01e49c3727f69e2da875989b4e61ab10fc058327 Author: William S. Moses Date: Tue Jun 21 01:14:31 2016 -0400 fix loop2cilk recog issue commit 1c52cbf136f247110b7c9e4cac0a5a0d73ad63f7 Author: William S. Moses Date: Tue Jun 21 00:35:03 2016 -0400 remove pre sroa commit 510bfacf5154f48e729c159c95c965acf4eef120 Author: William S. Moses Date: Mon Jun 20 20:36:34 2016 -0400 loop2cilk fixes to indvar commit ef34ac80086a10e3ae04b9fd2ce4d99436eaa69e Author: Ubuntu Date: Mon Jun 20 19:00:07 2016 +0000 Resolve linker errors commit 4387eb25bb6e36f0e5f8d04c9d9d3f710864044a Author: William S. Moses Date: Mon Jun 20 14:47:48 2016 -0400 Loop2cilk new indvar calculation commit d4e44d43b5c6e40883975e87aa2c4c46759a8eb8 Author: William S. Moses Date: Mon Jun 20 04:10:48 2016 -0400 loop2cilk without opts commit 9164742231eb140864e17562dd7e79161685e293 Author: William S. Moses Date: Mon Jun 20 03:48:51 2016 -0400 correct loop bounds calculation commit d0d80c596491f3d8b7b9f2479f996f9345e9f059 Author: William S. Moses Date: Sun Jun 19 00:43:55 2016 -0400 clean up compile commit 26beb619a1384b470ca0e668c1a838ee85b78b75 Author: William S. 
Moses Date: Fri Jun 17 14:37:46 2016 -0400 remove debug message commit 76a163ddffdb916de1bee5fef34298e676266bff Author: Ubuntu Date: Wed Jun 15 20:58:36 2016 +0000 nomem commit 126c754b4f8e553e6b9ff33f899afaaf4182ee04 Author: William S. Moses Date: Wed Jun 15 15:41:57 2016 -0400 fixes and less print commit cd037d2993381148f11954f51ff89c6b5e599086 Author: William S. Moses Date: Tue Jun 14 23:33:28 2016 -0400 restore cilkabi commit 5964e893682feec3a63d17999d32c2125486e879 Author: William S. Moses Date: Tue Jun 14 23:19:52 2016 -0400 fix inline bug commit b5a22ebc589fc25b72f513eb16ccbedc6482e9f2 Author: William S. Moses Date: Tue Jun 14 14:32:41 2016 -0400 cleanup dumps commit 2ab9f07b81a7fb04c33926c2899c4af1753d6175 Author: William S. Moses Date: Tue Jun 14 14:30:04 2016 -0400 cleanup dumps commit 56d8d0f052de051328c2077bcd47e75f34d9f034 Author: William S. Moses Date: Tue Jun 14 12:35:26 2016 -0400 cleanup dumps commit d95ce1575159c12135952b3fa39a092bc77ad298 Author: William S. Moses Date: Tue Jun 14 12:29:38 2016 -0400 addl sroa fixes commit 2754c0b40a4ca26d3201005a1d2796b840bdcce7 Author: William S. Moses Date: Tue Jun 14 12:16:02 2016 -0400 loop2cilk ordering issue for ind var calculation fixed commit bebf5cc0565d9060e78a3caeb880b2ce8f43b36c Author: William S. Moses Date: Tue Jun 14 11:27:20 2016 -0400 Fix SROA for detached allocas commit 222ecb6dfd053282d450cbe9cffc7cea4d98fa5d Author: William S. Moses Date: Tue Jun 14 00:36:00 2016 -0400 minor bugfix commit 446ad1a3bad89a44dd2c361cc0d9417a0a07eb2b Author: William S. Moses Date: Mon Jun 13 21:59:25 2016 -0400 bugfixes commit bc37ee11a97c23b0576d45bcc94e7a597ff30a39 Author: William S. Moses Date: Thu Jun 9 10:43:21 2016 -0400 Fix odd LICM error commit abfc103a0f06248526972ddd6f6057e372d56383 Author: William S. Moses Date: Wed Jun 8 01:04:49 2016 -0400 parallel opt levels and fix codegen pt 1 commit cab96d82f5d94a4a6745983953f43850d3a80f7d Author: William S. 
Moses Date: Fri Jun 3 01:43:13 2016 -0400 fix compile script commit 6284487a349fe982d5d24d2ff45d8ff5c8d25708 Author: William S. Moses Date: Fri Jun 3 01:41:01 2016 -0400 fix l2c commit 3783dfebd1a8d94ab40b958e03ffb99ac54e3f5b Author: William S. Moses Date: Thu Jun 2 23:50:39 2016 -0400 Fix allocation issues commit fc2042d6a1331df9a55148208d27b2c2d4834ef7 Author: William S. Moses Date: Mon May 30 15:20:22 2016 -0400 add unique block debug info commit cd3303d769327d50bcf3a422496190ed349cbaac Author: William S. Moses Date: Mon May 30 15:17:18 2016 -0400 fix exit block detection l2c commit 4865203b50d0ad69531b6459a35d557908db3ffe Author: William S. Moses Date: Mon May 30 15:02:11 2016 -0400 fix sync l2c detection issue commit e95a55ae8775dfe21c0ce10e0ea32332bc3d973a Author: William S. Moses Date: Sun May 29 23:31:59 2016 -0400 allow switch and better cmp block commit b17417485a42308842840748c73c76953302dc30 Author: William S. Moses Date: Sun May 29 22:09:34 2016 -0400 fix issues in multiple phi nodes for l2c commit f64fca467066650bdab351a55ec38943d360fced Author: William S. Moses Date: Sun May 29 17:29:00 2016 -0400 add addl check for loop2cilk commit 8d9ac096f9beda10ff400631aae3336b5cb0982e Author: William S. Moses Date: Sat May 28 22:36:56 2016 -0400 minor script fix commit 748021ae6a76b9d6e2ecb85b3e247455d5e9bdb9 Author: William S. Moses Date: Sat May 28 22:24:41 2016 -0400 lots of minor cilk error fixes commit 0132cc1ce667fd8c21adaf5b3abd5dfadac80c09 Author: William S. Moses Date: Wed May 25 11:52:28 2016 -0400 fix bug in l2c about branching into commit 9f921005730c6c92fbdf19b36714488c72c0975e Author: William S. Moses Date: Tue May 24 23:40:12 2016 -0400 fix bug in loop2cilk commit a9d9cd9529c20022fd5ca0600042065cfee21d8f Author: William S. Moses Date: Sun Apr 10 14:32:22 2016 -0400 resolve block seg commit 7410b7bcfbf610b34a0f42c0966cbdbd2e9b2e97 Author: William S. 
Moses Date: Sun Apr 10 13:55:01 2016 -0400 fixes commit 11a77b870e734e617b00e4b55f09526cf2ac37d4 Author: William S. Moses Date: Thu Apr 7 03:04:30 2016 -0400 add compile commit f2ec969a1965da3224fdffed035b9d39114d2b9a Author: William S. Moses Date: Thu Apr 7 03:04:17 2016 -0400 pre detach merging / loop unroll fixes commit 9c00e9b80d865cf478607a4ddb90ca018ad2978c Author: William S. Moses Date: Thu Apr 7 00:27:15 2016 -0400 sync fix commit 1f3c6dcb9d48ba519fde34c66b657571949428f7 Author: William S. Moses Date: Thu Apr 7 00:12:58 2016 -0400 bug fixes commit 0f1b1cf061ab790622c6498e0df9c5487a8d610c Author: William S. Moses Date: Tue Apr 5 18:44:04 2016 -0400 resolve delete issues commit 86cd5870f9d667ff36b2c10971216e8f6d0977d0 Author: William S. Moses Date: Tue Apr 5 13:10:36 2016 -0400 resolve delete issues commit 06defa794acaf1f13ecdd63d57b38a49e2561492 Merge: 2f7e6ec4fa6 8b47c17a53d Author: William S. Moses Date: Tue Apr 5 11:57:10 2016 -0400 Merge remote-tracking branch 'llvm/release_38' commit 8b47c17a53d683f313eaaa93c4a53de26d8fcba5 Author: Dimitry Andric Date: Tue Apr 5 06:58:21 2016 +0000 Merging r264335: ------------------------------------------------------------------------ r264335 | dim | 2016-03-24 21:39:17 +0100 (Thu, 24 Mar 2016) | 17 lines Add <atomic> to ThreadPool.h, since std::atomic is used Summary: Apparently, when compiling with gcc 5.3.2 for powerpc64, the order of headers is such that it gets an error about std::atomic<> use in ThreadPool.h, since this header is not included explicitly. See also: https://llvm.org/bugs/show_bug.cgi?id=27058 Fix this by including <atomic>. Patch by Bryan Drewery. 
Reviewers: chandlerc, joker.eph Subscribers: bdrewery, llvm-commits Differential Revision: http://reviews.llvm.org/D18460 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@265380 91177308-0d34-0410-b5e6-96231b3b80d8 commit 295c7a62d88d363361198766ce95900441727da9 Author: Renato Golin Date: Sat Apr 2 20:36:55 2016 +0000 Merging r263714: ARM: Revert SVN r253865, 254158, fix windows division git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@265245 91177308-0d34-0410-b5e6-96231b3b80d8 commit 2a2d901e3c55aff48990de5e415c429c4cfeb6d8 Author: Renato Golin Date: Sat Apr 2 20:32:54 2016 +0000 Merging r263123: ARM: follow up improvements for SVN r263118 git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@265244 91177308-0d34-0410-b5e6-96231b3b80d8 commit 97a35e605ab417f11be4ccb532fcc9015ebb2ca8 Author: Renato Golin Date: Sat Apr 2 20:31:15 2016 +0000 Merging r263118: ARM: correct __builtin_longjmp on WoA git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@265243 91177308-0d34-0410-b5e6-96231b3b80d8 commit dec3a22cf5b8f8e6c6d1bf898f3a14bc4c54e0b4 Author: Tom Stellard Date: Mon Mar 28 18:13:48 2016 +0000 Bump version to 3.8.1 git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@264605 91177308-0d34-0410-b5e6-96231b3b80d8 commit 2f7e6ec4fa663dff11ba3dff5f74468e79c042d9 Author: TB Schardl Date: Thu Mar 17 08:15:50 2016 +0000 Cleaning up CilkABI. commit 88a51fc0886146600e14173a0878b6567b29e3bc Author: TB Schardl Date: Thu Mar 17 08:15:05 2016 +0000 Fixing Loop2Cilk CMakeLists entries to fix cmake build. 
commit 0d0d243f395a4192bf4d85817c8ac14f5d9d8b2f Author: TB Schardl Date: Thu Mar 17 08:14:16 2016 +0000 Fixing Loop2Cilk for merge with 'release_38' commit 277ca2c63350507bf3ba5cd075f204e4b356fc5f Merge: 008aa9d2441 ad5750369cc Author: TB Schardl Date: Thu Mar 17 08:09:16 2016 +0000 Merge branch 'release_38' of http://llvm.org/git/llvm into tb-scratch commit 008aa9d24417420734027b5072ea48cc86b428d2 Author: William S. Moses Date: Sat Mar 12 17:32:11 2016 -0500 loop2cilk working happily commit ea5e316db15804df27dcfaf6b790f07c8e7bd2b2 Merge: 9b3fc2538fd 1526147c0ad Author: William S. Moses Date: Thu Mar 10 13:16:18 2016 -0500 Merge branch 'tb-scratch' of ssh://github.com/taekwonbilly/Parallel-IR into tb-scratch commit 9b3fc2538fdd9218bcb1a91b954028652579c6e4 Author: William S. Moses Date: Thu Mar 10 13:15:45 2016 -0500 loop2cilk mods commit ad5750369cc5b19f36c149f7b13151c99c7be47a Author: Hans Wennborg Date: Wed Mar 2 23:38:03 2016 +0000 ReleaseNotes: tidy up git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@262542 91177308-0d34-0410-b5e6-96231b3b80d8 commit 0805780408c97128dc9164d4dbb8604882f5588e Author: Hans Wennborg Date: Wed Mar 2 23:10:55 2016 +0000 Remove 'if you are using a released version' warning git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@262537 91177308-0d34-0410-b5e6-96231b3b80d8 commit f26161e8b05360841a1a3a4a2204ed761d6a2e04 Author: Hans Wennborg Date: Wed Mar 2 18:19:22 2016 +0000 ReleaseNotes: C API policy; by Eric Christopher git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@262496 91177308-0d34-0410-b5e6-96231b3b80d8 commit 27c964e2ae0b573cf1e6551a3da255539db03d3c Author: Hans Wennborg Date: Fri Feb 26 21:37:52 2016 +0000 ReleaseNotes: PowerPC; by Kit Barton git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@262074 91177308-0d34-0410-b5e6-96231b3b80d8 commit bb6f14e3581c78509405a3d415e72821db8a2066 Author: Quentin Colombet Date: Mon Feb 22 22:27:47 2016 +0000 
[AArch64] Fix bug in prolog clobbering live reg when shrink wrapping. This adapts r261349 to the release branch. git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261586 91177308-0d34-0410-b5e6-96231b3b80d8 commit e970b795a27d16c720bf4e3ff030eea241784eb4 Author: Hans Wennborg Date: Mon Feb 22 21:05:14 2016 +0000 Merging r261441, r261447, and r261546: ------------------------------------------------------------------------ r261441 | nemanjai | 2016-02-20 10:16:25 -0800 (Sat, 20 Feb 2016) | 12 lines Fix for PR 26500 This patch corresponds to review: http://reviews.llvm.org/D17294 It ensures that whatever block we are emitting the prologue/epilogue into, we have the necessary scratch registers. It takes away the hard-coded register numbers for use as scratch registers as registers that are guaranteed to be available in the function prologue/epilogue are not guaranteed to be available within the function body. Since we shrink-wrap, the prologue/epilogue may end up in the function body. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r261447 | nemanjai | 2016-02-20 12:45:37 -0800 (Sat, 20 Feb 2016) | 6 lines Fix the build bot break caused by rL261441. The patch has a necessary call to a function inside an assert. Which is fine when you have asserts turned on. Not so much when they're off. Sorry about the regression. ------------------------------------------------------------------------ ------------------------------------------------------------------------ r261546 | nemanjai | 2016-02-22 10:04:00 -0800 (Mon, 22 Feb 2016) | 6 lines Fix for PR26690 take 2 This is what was meant to be in the initial commit to fix this bug. The parens were missing. This commit also adds a test case for the bug and has undergone full testing on PPC and X86. 
------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261572 91177308-0d34-0410-b5e6-96231b3b80d8 commit f65e46be097186d748836d42c38a6dc7f30e6c3b Author: Hans Wennborg Date: Mon Feb 22 17:51:28 2016 +0000 Merging r261387: ------------------------------------------------------------------------ r261387 | davide | 2016-02-19 16:44:47 -0800 (Fri, 19 Feb 2016) | 8 lines [X86ISelLowering] Fix TLSADDR lowering when shrink-wrapping is enabled. TLSADDR nodes are lowered into actual calls inside MC. In order to prevent shrink-wrapping from pushing prologue/epilogue past them (which result in TLS variables being accessed before the stack frame is set up), we put markers, so that the stack gets adjusted properly. Thanks to Quentin Colombet for guidance/help on how to fix this problem! ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261542 91177308-0d34-0410-b5e6-96231b3b80d8 commit e3b2bd1e79c9c9d24490b6ddb2341afcf4210691 Author: Hans Wennborg Date: Mon Feb 22 17:47:10 2016 +0000 Merging r261384: ------------------------------------------------------------------------ r261384 | qcolombet | 2016-02-19 16:32:29 -0800 (Fri, 19 Feb 2016) | 4 lines [RegAllocFast] Properly track the physical register definitions on calls. PR26485 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261539 91177308-0d34-0410-b5e6-96231b3b80d8 commit c63a0fe41b81bac1ea6e1a053d2a8939e02edf17 Author: Hans Wennborg Date: Fri Feb 19 21:42:57 2016 +0000 Merging r261368: ------------------------------------------------------------------------ r261368 | hans | 2016-02-19 13:40:12 -0800 (Fri, 19 Feb 2016) | 3 lines Revert r255691 "[LoopVectorizer] Refine loop vectorizer's register usage calculator by ignoring specific instructions." 
It caused PR26509.
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261369 91177308-0d34-0410-b5e6-96231b3b80d8

commit 78e9cd40a2ea27cc9300d900a7dccc75940f9eb0
Author: Hans Wennborg
Date: Fri Feb 19 21:35:00 2016 +0000

Merging r261360:
------------------------------------------------------------------------
r261360 | dim | 2016-02-19 12:14:11 -0800 (Fri, 19 Feb 2016) | 19 lines

Fix incorrect selection of AVX512 sqrt when OptForSize is on

Summary:
When optimizing for size, sqrt calls can be incorrectly selected as AVX512 VSQRT instructions. This is because X86InstrAVX512.td has a `Requires<[OptForSize]>` in its `avx512_sqrt_scalar` multiclass definition. Even if the target does not support AVX512, the class can apparently still be chosen, leading to an incorrect selection of `vsqrtss`.

In PR26625, this led to an assertion: Reg >= X86::FP0 && Reg <= X86::FP6 && "Expected FP register!", because the `vsqrtss` instruction requires an XMM register, which is not available on i686 CPUs.

Reviewers: grosbach, resistor, joker.eph

Subscribers: spatel, emaste, llvm-commits

Differential Revision: http://reviews.llvm.org/D17414
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261367 91177308-0d34-0410-b5e6-96231b3b80d8

commit fdf40bea4fc416643210790fff4345be98d97245
Author: Hans Wennborg
Date: Fri Feb 19 21:28:08 2016 +0000

Merging r261365:
------------------------------------------------------------------------
r261365 | hans | 2016-02-19 13:26:31 -0800 (Fri, 19 Feb 2016) | 3 lines

Revert r253557 "Alternative to long nops for X86 CPUs, by Andrey Turetsky"

Turns out the new nop sequences aren't actually nops on x86_64 (PR26554).
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261366 91177308-0d34-0410-b5e6-96231b3b80d8

commit 413ee9f101de92d75fc11334ffeb6a054d67a18c
Author: Renato Golin
Date: Fri Feb 19 17:35:27 2016 +0000

Merge r261331: avoid out of bounds loads for interleaved access vectorization

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261341 91177308-0d34-0410-b5e6-96231b3b80d8

commit 124d2bc4dc3298d2b669be23a5b640d985319b65
Author: Hans Wennborg
Date: Fri Feb 19 17:13:16 2016 +0000

Merging r261306:
------------------------------------------------------------------------
r261306 | matze | 2016-02-18 20:44:19 -0800 (Thu, 18 Feb 2016) | 1 line

LegalizeDAG: Fix ExpandFCOPYSIGN assuming the same type on both inputs
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261334 91177308-0d34-0410-b5e6-96231b3b80d8

commit 6f28d52e9d3f87875732a0f2c1f3b03ef56be2db
Author: Hans Wennborg
Date: Fri Feb 19 00:08:56 2016 +0000

Merging r261258:
------------------------------------------------------------------------
r261258 | rnk | 2016-02-18 12:57:41 -0800 (Thu, 18 Feb 2016) | 14 lines

[IR] Straighten out bundle overload of IRBuilder::CreateCall

IRBuilder has two ways of putting bundle operands on calls: the default operand bundle, and an overload of CreateCall that takes an operand bundle list. Previously, this overload used a default argument of None. This made it impossible to distinguish between the case where the caller doesn't care about bundles, and the case where the caller explicitly wants no bundles. We behaved as if they wanted the latter behavior rather than the former, which led to problems with simplifylibcalls and WinEH. This change fixes it by making the parameter non-optional, so we can distinguish these two cases.
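The r261258 pitfall is a general API-design hazard: a defaulted "no bundles" parameter makes "the caller said nothing" and "the caller explicitly passed an empty list" the same call. A minimal sketch of both shapes (names like `Builder`, `callDefaulted`, and the `"funclet"` bundle string are illustrative, not LLVM's actual IRBuilder API):

```cpp
#include <cassert>
#include <string>
#include <vector>

struct Builder {
  // Builder state that should apply whenever the caller says nothing
  // about bundles.
  std::vector<std::string> DefaultBundles{"funclet"};

  // Broken shape: a defaulted parameter. An omitted argument and an
  // explicitly empty list are the same call, so the builder cannot
  // apply DefaultBundles only in the "caller didn't care" case.
  std::vector<std::string> callDefaulted(std::vector<std::string> Bundles = {}) {
    return Bundles; // DefaultBundles are silently dropped for everyone
  }

  // Fixed shape (mirroring the r261258 idea): two distinct entry points,
  // with the bundle-taking one requiring an argument.
  std::vector<std::string> call() { return DefaultBundles; }
  std::vector<std::string> call(std::vector<std::string> Bundles) { return Bundles; }
};
```

With the fixed shape, `call()` picks up the default bundles while `call({})` deliberately attaches none; the defaulted variant cannot make that distinction.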
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261282 91177308-0d34-0410-b5e6-96231b3b80d8

commit 6e961aa243f223ddb704ce708056238d7c1d7e24
Author: Hans Wennborg
Date: Wed Feb 17 19:00:40 2016 +0000

Merging r261039:
------------------------------------------------------------------------
r261039 | rnk | 2016-02-16 16:17:33 -0800 (Tue, 16 Feb 2016) | 6 lines

[X86] Fix a shrink-wrapping miscompile around __chkstk

__chkstk clobbers EAX. If EAX is live across the prologue, then we have to take extra steps to save it. We already had code to do this if EAX was a register parameter. This change adapts it to work when shrink wrapping is used.
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261136 91177308-0d34-0410-b5e6-96231b3b80d8

commit ebe537a930b58a5d32fc41ac133309139c92f7bd
Author: David Majnemer
Date: Wed Feb 17 18:49:28 2016 +0000

Merging r258616:
------------------------------------------------------------------------
r258616 | majnemer | 2016-01-22 22:00:44 -0800 (Fri, 22 Jan 2016) | 3 lines

[PruneEH] Don't try to insert a terminator after another terminator

LLVM's BasicBlock has a single terminator; it is not valid to have two.
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261132 91177308-0d34-0410-b5e6-96231b3b80d8

commit 9f25a0678ed9f06088a09649a040a6bef362e6af
Author: David Majnemer
Date: Wed Feb 17 18:49:09 2016 +0000

Merging r258611:
------------------------------------------------------------------------
r258611 | majnemer | 2016-01-22 21:41:29 -0800 (Fri, 22 Jan 2016) | 6 lines

[PruneEH] FuncletPads must not have undef operands

Instead of RAUW with undef, replace the first non-token instruction with unreachable.

This fixes PR26263.
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261131 91177308-0d34-0410-b5e6-96231b3b80d8

commit 4212ebff28e32dbd26bd93f4fa77190d80357ed4
Author: David Majnemer
Date: Wed Feb 17 18:48:45 2016 +0000

Merging r258610:
------------------------------------------------------------------------
r258610 | majnemer | 2016-01-22 21:41:27 -0800 (Fri, 22 Jan 2016) | 3 lines

[PruneEH] Unify invoke and call handling in DeleteBasicBlock

No functionality change is intended.
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261130 91177308-0d34-0410-b5e6-96231b3b80d8

commit ba95fe05372c1934c30e21747480d401c1e5bcec
Author: David Majnemer
Date: Wed Feb 17 18:48:28 2016 +0000

Merging r258609:
------------------------------------------------------------------------
r258609 | majnemer | 2016-01-22 21:41:22 -0800 (Fri, 22 Jan 2016) | 5 lines

[PruneEH] Reuse code from removeUnwindEdge

PruneEH had functionality identical to removeUnwindEdge. Consolidate around removeUnwindEdge. No functionality change is intended.
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261129 91177308-0d34-0410-b5e6-96231b3b80d8

commit 77c8a562e0c7c47df3bb988e2d230df6a9dcbe1d
Author: David Majnemer
Date: Wed Feb 17 18:42:17 2016 +0000

Merging r259702:
------------------------------------------------------------------------
r259702 | majnemer | 2016-02-03 13:30:34 -0800 (Wed, 03 Feb 2016) | 7 lines

[LoopStrengthReduce] Don't rewrite PHIs with incoming values from CatchSwitches

Bail out if we have a PHI on an EHPad that gets a value from a CatchSwitchInst. Because the CatchSwitchInst cannot be split, there is no good place to stick any instructions.

This fixes PR26373.
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261126 91177308-0d34-0410-b5e6-96231b3b80d8

commit c75c50f45b3d6d1d61ce6b411d12cedaadd71d5b
Author: David Majnemer
Date: Wed Feb 17 18:41:44 2016 +0000

Merging r260164:
------------------------------------------------------------------------
r260164 | akaylor | 2016-02-08 14:52:51 -0800 (Mon, 08 Feb 2016) | 5 lines

[regalloc][WinEH] Do not mark intervals as not spillable if they contain a regmask

Differential Revision: http://reviews.llvm.org/D16831
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261125 91177308-0d34-0410-b5e6-96231b3b80d8

commit fde3338c42eb085f169ecc3817c4736075e4a683
Author: David Majnemer
Date: Wed Feb 17 18:41:08 2016 +0000

Merging r260733:
------------------------------------------------------------------------
r260733 | akaylor | 2016-02-12 13:10:16 -0800 (Fri, 12 Feb 2016) | 5 lines

[WinEH] Prevent EH state numbering from skipping nested cleanup pads that never return

Differential Revision: http://reviews.llvm.org/D17208
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261124 91177308-0d34-0410-b5e6-96231b3b80d8

commit 2507c58ca21ee01c359cd5ddf2fe84eea16366ee
Author: Hans Wennborg
Date: Wed Feb 17 17:57:26 2016 +0000

ReleaseNotes: new Win EH instructions; by David Majnemer

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261116 91177308-0d34-0410-b5e6-96231b3b80d8

commit d77e9352a80c954cf91335c236224e4ca7d9c5f4
Author: Hans Wennborg
Date: Wed Feb 17 16:40:51 2016 +0000

Merging r261033:
------------------------------------------------------------------------
r261033 | akaylor | 2016-02-16 15:52:18 -0800 (Tue, 16 Feb 2016) | 5 lines

Fix build LLVM with -D LLVM_USE_INTEL_JITEVENTS:BOOL=ON on
Windows

Differential Revision: http://reviews.llvm.org/D16940
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261106 91177308-0d34-0410-b5e6-96231b3b80d8

commit 7609bf251117db67abfe0d5b6622860afc769278
Author: Hans Wennborg
Date: Wed Feb 17 00:05:18 2016 +0000

ReleaseNotes: -femulated-tls; by Chih-hung Hsieh

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261035 91177308-0d34-0410-b5e6-96231b3b80d8

commit 07fd930a2be55b0789737cd9769f0d0e42def3a7
Author: Hans Wennborg
Date: Tue Feb 16 23:22:17 2016 +0000

Merging r260390:
------------------------------------------------------------------------
r260390 | jyknight | 2016-02-10 09:47:20 -0800 (Wed, 10 Feb 2016) | 12 lines

[SPARC] Repair floating-point condition encodings in assembly parser.

The encodings for floating point conditions A(lways) and N(ever) were incorrectly specified for the assembly parser, per Sparc manual v8 page 121. This change corrects that mistake.

Also, strangely, all of the branch instructions already had MC test cases, except for the broken ones. Added the tests.
Patch by Chris Dewhurst

Differential Revision: http://reviews.llvm.org/D17074
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261031 91177308-0d34-0410-b5e6-96231b3b80d8

commit b7b1a389f7d546dbe6a67aa3bb0e66f689e99c1b
Author: Hans Wennborg
Date: Tue Feb 16 21:46:52 2016 +0000

Merging r258103:
------------------------------------------------------------------------
r258103 | kli | 2016-01-18 16:04:41 -0800 (Mon, 18 Jan 2016) | 2 lines

parseArch() supports more variations of arch names for PowerPC builds
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261015 91177308-0d34-0410-b5e6-96231b3b80d8

commit fff361d60b64ac8ee9fcb523872aa7beea8ab8e1
Author: Hans Wennborg
Date: Tue Feb 16 19:37:14 2016 +0000

ReleaseNotes: shrink-wrapping; by Quentin Colombet

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261001 91177308-0d34-0410-b5e6-96231b3b80d8

commit b129a10bb92529289bbb26d2335b12858e54a885
Author: Hans Wennborg
Date: Tue Feb 16 19:29:54 2016 +0000

ReleaseNotes: typo

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@261000 91177308-0d34-0410-b5e6-96231b3b80d8

commit d3b1222c56e9214e49a3d829e8e60910f8c88903
Author: Hans Wennborg
Date: Tue Feb 16 19:27:50 2016 +0000

ReleaseNotes: Hexagon; by Krzysztof Parzyszek

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260999 91177308-0d34-0410-b5e6-96231b3b80d8

commit f1aaed61455e48b6c7444f706a6f997a864a42fa
Author: Hans Wennborg
Date: Tue Feb 16 19:20:40 2016 +0000

Merging r257864 and r258112:
------------------------------------------------------------------------
r257864 | axw | 2016-01-14 19:33:35 -0800 (Thu, 14 Jan 2016) | 12 lines

[docs] Document LLVM_{BUILD,LINK}_LLVM_DYLIB

Summary:
Document the LLVM_BUILD_LLVM_DYLIB and LLVM_LINK_LLVM_DYLIB CMake options, move
BUILD_SHARED_LIBS out of frequently-used, and add a note/warning to BUILD_SHARED_LIBS.

Reviewers: beanz, delcypher, mjacob

Subscribers: mjacob, llvm-commits

Differential Revision: http://reviews.llvm.org/D16208
------------------------------------------------------------------------
------------------------------------------------------------------------
r258112 | axw | 2016-01-18 21:43:21 -0800 (Mon, 18 Jan 2016) | 8 lines

docs: address post-commit review

Rewording/expansion of CMake options suggested by Dan Liew. See http://reviews.llvm.org/D16208.
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260996 91177308-0d34-0410-b5e6-96231b3b80d8

commit 80cc2ce6475352a29e19824443c2e0a31a37b44d
Author: Hans Wennborg
Date: Tue Feb 16 19:19:03 2016 +0000

ReleaseNotes: -DLLVM_LINK_LLVM_DYLIB=ON; by Andrew Wilkins

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260995 91177308-0d34-0410-b5e6-96231b3b80d8

commit 1e466cf4f8098acc7025f8d71dd0f64c4754ed63
Author: Hans Wennborg
Date: Tue Feb 16 19:07:38 2016 +0000

ReleaseNotes: ORC in Kaleidoscope and C bindings; by Lang Hames

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260991 91177308-0d34-0410-b5e6-96231b3b80d8

commit b508a338d9d922a1ec3fbef698bd9fc6b5217ae0
Author: Hans Wennborg
Date: Tue Feb 16 17:38:25 2016 +0000

ReleaseNotes: fix typo, reported by Eugene

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260985 91177308-0d34-0410-b5e6-96231b3b80d8

commit 4f229233ffc588a35e3738d3c358f2cf7a5da1d1
Author: Hans Wennborg
Date: Fri Feb 12 19:03:12 2016 +0000

Merging r260703:
------------------------------------------------------------------------
r260703 | hans | 2016-02-12 11:02:39 -0800 (Fri, 12 Feb 2016) | 11 lines

[CMake] don't build libLTO when LLVM_ENABLE_PIC is OFF

When cmake is run with -DLLVM_ENABLE_PIC=OFF, build fails while linking shared library libLTO.so,
because its dependencies are built with -fno-PIC. More details here: https://llvm.org/bugs/show_bug.cgi?id=26484.

This diff reverts r252652 (git 9fd4377ddb83aee3c049dc8757e7771edbb8ee71), which removed the check for NOT LLVM_ENABLE_PIC before disabling the build of libLTO.so.

Patch by Igor Sugak!

Differential Revision: http://reviews.llvm.org/D17049
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260704 91177308-0d34-0410-b5e6-96231b3b80d8

commit 7e2ddb94a31d1d085b0228e374799566faa82b8e
Author: Peter Collingbourne
Date: Fri Feb 12 18:46:48 2016 +0000

ARM: Mention r251322 in release notes.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260702 91177308-0d34-0410-b5e6-96231b3b80d8

commit 347f4e82e80af64eca192381112ff6e9e3c7c8c3
Author: Hans Wennborg
Date: Fri Feb 12 17:52:29 2016 +0000

Merging r260641:
------------------------------------------------------------------------
r260641 | axw | 2016-02-11 17:42:43 -0800 (Thu, 11 Feb 2016) | 10 lines

Avoid linking LLVM component libraries with libLLVM

Patch by Jack Howarth.

When linking to libLLVM, don't also link to the component libraries that constitute libLLVM.

Differential Revision: http://reviews.llvm.org/D16945
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260693 91177308-0d34-0410-b5e6-96231b3b80d8

commit e469b8a4f8daa8d29fe1d1f8ed87b36114dd5726
Author: Hans Wennborg
Date: Fri Feb 12 16:18:07 2016 +0000

Merging r260427:
------------------------------------------------------------------------
r260427 | nha | 2016-02-10 12:13:58 -0800 (Wed, 10 Feb 2016) | 16 lines

AMDGPU: Release the scavenged offset register during VGPR spill

Summary:
This fixes a crash where subsequent spills would be unable to scavenge a register.
In particular, it fixes a crash in piglit's spec@glsl-1.50@execution@geometry@max-input-components (the test still has a shader that fails to compile because of too many SGPR spills, but at least it doesn't crash any more).

This is a candidate for the release branch.

Reviewers: arsenm, tstellarAMD

Subscribers: qcolombet, arsenm

Differential Revision: http://reviews.llvm.org/D16558
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260687 91177308-0d34-0410-b5e6-96231b3b80d8

commit ec95d6fe25dcb8b1450c4440da7c7a7e2982b6f2
Author: Renato Golin
Date: Fri Feb 12 15:29:34 2016 +0000

[ARM/AArch64] 3.8.0 release notes changes

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260684 91177308-0d34-0410-b5e6-96231b3b80d8

commit 10a5589d08c1de3fcd715ce23697d4e591519595
Author: Dylan McKay
Date: Fri Feb 12 06:38:02 2016 +0000

[AVR] Add release notes for 3.8

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260659 91177308-0d34-0410-b5e6-96231b3b80d8

commit 12009f63c5d16b98334930a2b97d279c6bf82ea0
Author: Hans Wennborg
Date: Fri Feb 12 02:32:24 2016 +0000

ReleaseNotes: oh, there already was a section about X86

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260650 91177308-0d34-0410-b5e6-96231b3b80d8

commit fb52ed812c40eb8c6f1f69575bb231b62b319a95
Author: Hans Wennborg
Date: Fri Feb 12 02:29:33 2016 +0000

ReleaseNotes: start off a 'Changes to X86' section

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260648 91177308-0d34-0410-b5e6-96231b3b80d8

commit e293d6c8d134ad352bb69defee17c5c902476933
Author: Hans Wennborg
Date: Fri Feb 12 01:56:35 2016 +0000

Release Notes: RegisterScheduler::setDefault removed; by Mehdi Amini

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260643 91177308-0d34-0410-b5e6-96231b3b80d8

commit 7a0ec464f16e761602ac9c4e1f610029c0346745
Author: Hans Wennborg
Date: Fri Feb 12 01:42:38 2016 +0000

Merging r260587:
------------------------------------------------------------------------
r260587 | pete | 2016-02-11 13:10:40 -0800 (Thu, 11 Feb 2016) | 13 lines

Set load alignment on aggregate loads.

When optimizing an extractvalue(load), we generate a load from the aggregate type. This load didn't have alignment set and so would get the alignment of the type. This breaks when the type is packed and so the alignment should be lower.

For example, loading { int, int } would give us alignment of 4, but the original load from this type may have an alignment of 1 if packed.

Reviewed by David Majnemer

Differential revision: http://reviews.llvm.org/D17158
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260640 91177308-0d34-0410-b5e6-96231b3b80d8

commit 73a8ae3c0f127d45e391bd8b40be51c2fbc15dd8
Author: Hans Wennborg
Date: Fri Feb 12 00:45:55 2016 +0000

ReleaseNotes: drop in-progress warning and svn checkout note

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260634 91177308-0d34-0410-b5e6-96231b3b80d8

commit 2ec5a319cacb9e13bf20bc8b9113d11212f10aae
Author: Kai Nacke
Date: Thu Feb 11 20:42:16 2016 +0000

Add LDC compiler to list of external OS projects using LLVM 3.8

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260584 91177308-0d34-0410-b5e6-96231b3b80d8

commit 6ca6b8a0c8560555aed16b880f1499a5a0b4deda
Author: Duncan P. N. Exon Smith
Date: Wed Feb 10 19:20:23 2016 +0000

ReleaseNotes: Document changes to ilist API

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260415 91177308-0d34-0410-b5e6-96231b3b80d8

commit 185bb1287f864701d9b19eef89e7838162e7c793
Author: Hans Wennborg
Date: Mon Feb 8 22:15:55 2016 +0000

Merging r259958:
------------------------------------------------------------------------
r259958 | evandro | 2016-02-05 16:01:41 -0800 (Fri, 05 Feb 2016) | 11 lines

[AArch64] Add the scheduling model for Exynos-M1

Summary:
Add the core scheduling model for the Samsung Exynos-M1 (ARMv8-A).

Reviewers: jmolloy, rengolin, christof, MinSeongKIM, t.p.northover

Subscribers: aemerson, rengolin, MatzeB

Differential Revision: http://reviews.llvm.org/D16644
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260156 91177308-0d34-0410-b5e6-96231b3b80d8

commit 777479f80202057f041683129d4fd9e574ffea79
Author: Hans Wennborg
Date: Mon Feb 8 18:31:49 2016 +0000

Merging r259696:
------------------------------------------------------------------------
r259696 | kfischer | 2016-02-03 13:13:33 -0800 (Wed, 03 Feb 2016) | 12 lines

[DWARFDebug] Fix another case of overlapping ranges

Summary:
In r257979, I added code to ensure that we wouldn't merge DebugLocEntries if the pieces they describe overlap. Unfortunately, I failed to cover the case where there may be multiple active Expressions in the entry, in which case we need to make sure that no two values overlap before we can perform the merge.

This fixed PR26148.
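The packed-aggregate alignment point from r260587 above can be seen in plain C++ (illustrative only; the commit itself is about alignment metadata on IR loads). `#pragma pack` is a widely supported GCC/Clang/MSVC extension:

```cpp
#include <cstdint>

// A natural { int, int } aggregate: each int32_t wants 4-byte alignment,
// so the struct itself is 4-byte aligned.
struct Normal { int32_t A; int32_t B; };

// The packed version may live at any byte address, so a load of the
// whole aggregate must not assume the natural 4-byte alignment.
#pragma pack(push, 1)
struct Packed { int32_t A; int32_t B; };
#pragma pack(pop)

static_assert(alignof(Normal) == 4, "natural alignment of { int, int }");
static_assert(alignof(Packed) == 1, "packed alignment is lower");
```

This is exactly the gap the commit closes: a load synthesized from the aggregate *type* would have claimed alignment 4 even when the original packed load only guaranteed 1.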
Reviewers: aprantl

Differential Revision: http://reviews.llvm.org/D16742
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260121 91177308-0d34-0410-b5e6-96231b3b80d8

commit 7ecd92d75cda45668b6b5fdbcdd2142826514e66
Author: Daniel Sanders
Date: Mon Feb 8 14:14:18 2016 +0000

[mips] Add initial release notes for MIPS32.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@260095 91177308-0d34-0410-b5e6-96231b3b80d8

commit ff65de018b6bb5bc4da3e923bbc0f55c5ca8e039
Author: Hans Wennborg
Date: Fri Feb 5 22:17:38 2016 +0000

Merging r259381:
------------------------------------------------------------------------
r259381 | uweigand | 2016-02-01 10:31:19 -0800 (Mon, 01 Feb 2016) | 21 lines

[SystemZ] Fix wrong-code generation for certain always-false conditions

We've found another bug in the code generation logic for a certain class of always-false conditions, those of the form

  if ((a & 1) < 0)

These only reach the back end when compiling without optimization.

The bug was introduced by the choice of using TEST UNDER MASK to implement a check for

  if ((a & MASK) < VAL)

as

  if ((a & MASK) == 0)

where VAL is less than the lowest bit of MASK. This is correct in all cases except for VAL == 0, in which case the original condition is always false, but the replacement isn't.

Fixed by excluding that particular case.
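The r259381 reasoning is easy to check exhaustively in plain C++ (a small illustrative model, not the SystemZ backend code): for a mask whose lowest set bit is L, any masked value is either 0 or at least L, so with 0 < VAL < L the rewrite agrees with the original, while VAL == 0 makes the original always false but not the rewrite.

```cpp
// Model of the TEST UNDER MASK rewrite: "(a & Mask) < Val" vs
// "(a & Mask) == 0". Mask 0b1100 has lowest set bit 4, so any
// masked value is 0, 4, 8, or 12.
bool rewriteMatchesOriginal() {
  const int Mask = 0b1100;
  for (int A = 0; A < 64; ++A)
    for (int Val = 1; Val < 4; ++Val)  // 0 < Val < lowest bit of Mask
      if (((A & Mask) < Val) != ((A & Mask) == 0))
        return false;                  // rewrite would be a miscompile
  return true;                         // rewrite is sound for these Vals
}

// The broken boundary case: with Val == 0 the original "(a & 1) < 0"
// is always false (the masked value is never negative), but the
// rewritten "(a & 1) == 0" is true for even a.
bool originalAlwaysFalse(int A) { return (A & 1) < 0; }
bool rewritten(int A) { return (A & 1) == 0; }
```

Here `rewriteMatchesOriginal()` returns true, while `originalAlwaysFalse(4)` is false yet `rewritten(4)` is true, which is precisely the VAL == 0 case the commit excludes.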
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@259940 91177308-0d34-0410-b5e6-96231b3b80d8

commit 56d368f5a52e60fa29891a6647034fffbba8713b
Author: Hans Wennborg
Date: Fri Feb 5 16:30:31 2016 +0000

Merging r259886 and r259888:
------------------------------------------------------------------------
r259886 | nemanjai | 2016-02-05 06:50:29 -0800 (Fri, 05 Feb 2016) | 5 lines

Fix for PR 26193

This is a simple fix for a PowerPC intrinsic that was incorrectly defined (the return type was incorrect).
------------------------------------------------------------------------
------------------------------------------------------------------------
r259888 | nemanjai | 2016-02-05 07:03:17 -0800 (Fri, 05 Feb 2016) | 3 lines

Add the missing test case for PR26193
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@259891 91177308-0d34-0410-b5e6-96231b3b80d8

commit 9be4dc8ab20a009ed5f24610888421ba84f8ec65
Author: Hans Wennborg
Date: Fri Feb 5 00:55:39 2016 +0000

Merging r259840 on top of r259178:
------------------------------------------------------------------------
r259178 | echristo | 2016-01-28 23:20:30 -0800 (Thu, 28 Jan 2016) | 1 line

Refactor common code for PPC fast isel load immediate selection.
------------------------------------------------------------------------
------------------------------------------------------------------------
r259840 | nemanjai | 2016-02-04 15:14:42 -0800 (Thu, 04 Feb 2016) | 7 lines

Fix for PR 26356

Using the load immediate only when the immediate (whether signed or unsigned) can fit in a 16-bit signed field. Namely, from -32768 to 32767 for signed and 0 to 65535 for unsigned.

This patch also ensures that we sign-extend under the right conditions.
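The PR26356 fixes above (r259840 here, and r259177 below) hinge on one fact: PPC's `li` sign-extends its 16-bit immediate into the register. A minimal C++ model of that behavior (illustrative, not the backend code; the narrowing cast models the 16-bit immediate field):

```cpp
#include <cstdint>

// Model of PPC "li": the register receives the sign extension of the
// low 16 bits of the immediate. (The out-of-range narrowing cast is
// modular two's-complement on all mainstream compilers, and defined
// behavior since C++20.)
int64_t li(int64_t Imm) { return static_cast<int16_t>(Imm); }

// An immediate is safe for a single li only when the sign-extended
// result reproduces the value the caller wanted.
bool safeForLi(int64_t Wanted) { return li(Wanted) == Wanted; }
```

For example, `li(32767)` and `li(-32768)` reproduce their inputs, but 32768 (0x8000) fits the *unsigned* 16-bit range while `li(0x8000)` materializes -32768; such constants must be assembled piecemeal instead, which is exactly the check the fix adds.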
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@259858 91177308-0d34-0410-b5e6-96231b3b80d8

commit 12d60e9e7c149a7d333e277dfbe25a720c88c585
Author: Hans Wennborg
Date: Fri Feb 5 00:46:12 2016 +0000

Merging r259798, r259835:
------------------------------------------------------------------------
r259798 | nemanjai | 2016-02-04 08:18:08 -0800 (Thu, 04 Feb 2016) | 9 lines

Enable the %s modifier in inline asm template string

This patch corresponds to review: http://reviews.llvm.org/D16847

There are some files in glibc that use the output operand modifier even though it was deprecated in GCC. This patch just adds support for it to prevent issues with such files.
------------------------------------------------------------------------
------------------------------------------------------------------------
r259835 | nemanjai | 2016-02-04 14:36:10 -0800 (Thu, 04 Feb 2016) | 3 lines

Provide a test case for r259798
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@259856 91177308-0d34-0410-b5e6-96231b3b80d8

commit 78a7d49140626994c23367b709e7b30b41e5cf70
Author: Hans Wennborg
Date: Thu Feb 4 16:59:45 2016 +0000

Merging r259695:
------------------------------------------------------------------------
r259695 | tfiala | 2016-02-03 13:13:23 -0800 (Wed, 03 Feb 2016) | 11 lines

Address NDEBUG-related linkage issues for Value::assertModuleIsMaterialized()

The IR/Value class had a linkage issue present when LLVM was built as a library, and the LLVM library build had different settings for NDEBUG than the client of the LLVM library. Clients could get into a state where the LLVM lib expected Value::assertModuleIsMaterialized() to be inline-defined in the header but clients expected that method to be defined in the LLVM library.
See this llvm-commits thread for more details: http://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20160201/329667.html
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@259801 91177308-0d34-0410-b5e6-96231b3b80d8

commit 19b86f670bb5005761ecdcbe41423fee7fd200cf
Author: Hans Wennborg
Date: Thu Feb 4 02:16:36 2016 +0000

Merging r259740:
------------------------------------------------------------------------
r259740 | nemanjai | 2016-02-03 17:58:20 -0800 (Wed, 03 Feb 2016) | 2 lines

Test case for PR 26381
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@259743 91177308-0d34-0410-b5e6-96231b3b80d8

commit 0a7ec6ced609c340fc4028aa8a65996623dd4181
Author: Hans Wennborg
Date: Wed Feb 3 22:00:13 2016 +0000

Merging r259177:
------------------------------------------------------------------------
r259177 | echristo | 2016-01-28 23:20:01 -0800 (Thu, 28 Jan 2016) | 5 lines

Since LI/LIS sign-extend the constant passed into the instruction, we should check that the sign-extended constant fits into 16 bits if we want a zero-extended value; otherwise, go ahead and put it together piecemeal.

Fixes PR26356.
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@259713 91177308-0d34-0410-b5e6-96231b3b80d8

commit 6b78a48f5c068df653f1c12d2ad7832aaa45c7a1
Author: Hans Wennborg
Date: Wed Feb 3 21:24:31 2016 +0000

Merging r259649:
------------------------------------------------------------------------
r259649 | jamesm | 2016-02-03 07:05:06 -0800 (Wed, 03 Feb 2016) | 11 lines

[DemandedBits] Revert r249687 due to PR26071

This regresses a test in LoopVectorize, so I'll need to go away and think about how to solve this in a way that isn't broken.
From the writeup in PR26071:

What's happening is that ComputeKnownZeroes is telling us that all bits except the LSB are zero. We're then deciding that only the LSB needs to be demanded from the icmp's inputs.

This is where we're wrong - we're assuming that after simplification the bits that were known zero will continue to be known zero. But they're not - during trivialization the upper bits get changed (because an XOR isn't shrunk), so the icmp fails.

The fault is in demandedbits - its contract does clearly state that a non-demanded bit may either be zero or one.
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@259699 91177308-0d34-0410-b5e6-96231b3b80d8

commit 18a86c95fc36b5f622e8dc87f71252de37a1ed44
Author: Hans Wennborg
Date: Wed Feb 3 21:18:35 2016 +0000

Merging r259645:
------------------------------------------------------------------------
r259645 | nemanjai | 2016-02-03 04:53:38 -0800 (Wed, 03 Feb 2016) | 4 lines

Fix for PR 26381

Simple fix - Constant values were not being sign extended in FastIsel.
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@259698 91177308-0d34-0410-b5e6-96231b3b80d8

commit 1bfe978e5d0ac77f381b0ccef78204f7f3593a01
Author: Hans Wennborg
Date: Tue Feb 2 17:41:39 2016 +0000

Merging r259346 (with adjustments for r258867):
------------------------------------------------------------------------
r259346 | ibreger | 2016-02-01 01:57:15 -0800 (Mon, 01 Feb 2016) | 3 lines

AVX512: fix mask handling for gather/scatter/prefetch intrinsics.
Differential Revision: http://reviews.llvm.org/D16755
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@259536 91177308-0d34-0410-b5e6-96231b3b80d8

commit f24a5b58cd7ecc4fada221308073b9f13672d6c0
Author: Hans Wennborg
Date: Tue Feb 2 17:35:07 2016 +0000

Merging r259342 (with s/p2align 4/align 16) because r258750 is not in 3.8.
------------------------------------------------------------------------
r259342 | ibreger | 2016-01-31 23:56:09 -0800 (Sun, 31 Jan 2016) | 3 lines

AVX512: Fix SETCCE lowering for KNL 32 bit.

Differential Revision: http://reviews.llvm.org/D16752
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@259533 91177308-0d34-0410-b5e6-96231b3b80d8

commit 5ea3635939d3e30182cd5a9881447890c8b69c42
Author: Hans Wennborg
Date: Mon Feb 1 19:18:10 2016 +0000

Merging r259375:
------------------------------------------------------------------------
r259375 | majnemer | 2016-02-01 09:37:56 -0800 (Mon, 01 Feb 2016) | 6 lines

[InstCombine] Don't transform (X+INT_MAX)>=(Y+INT_MAX) -> (X<=Y)

This miscompile came about because we tried to use a transform which was only appropriate for xor operators when addition was present.

This fixes PR26407.
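A concrete counterexample shows why the r259375 transform is invalid for addition. LLVM IR's `add` wraps, so model it with unsigned arithmetic (signed overflow is UB in plain C++; this sketch is illustrative, not the InstCombine code):

```cpp
#include <cstdint>

// Two's-complement wrapping 32-bit add, mimicking LLVM IR's `add`.
// (The out-of-range narrowing cast back to int32_t is modular on all
// mainstream compilers, and defined behavior since C++20.)
int32_t wrapAdd(int32_t X, int32_t Y) {
  return static_cast<int32_t>(static_cast<uint32_t>(X) +
                              static_cast<uint32_t>(Y));
}

// Original predicate: (X + INT_MAX) >= (Y + INT_MAX), with wrapping add.
bool original(int32_t X, int32_t Y) {
  return wrapAdd(X, INT32_MAX) >= wrapAdd(Y, INT32_MAX);
}

// Transformed predicate: X <= Y.
bool transformed(int32_t X, int32_t Y) { return X <= Y; }
```

Take X = 2, Y = 1: X + INT_MAX wraps to INT_MIN + 1 and Y + INT_MAX wraps to INT_MIN, so the original compare is true, but X <= Y is false. The two predicates disagree, which is exactly the miscompile the commit removes.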
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@259390 91177308-0d34-0410-b5e6-96231b3b80d8

commit aad888f28ee3e920b6e1a3828398f6c9c256f3d3
Author: Tim Northover
Date: Fri Jan 29 22:00:06 2016 +0000

Merging r259228:
------------------------------------------------------------------------
r259228 | tnorthover | 2016-01-29 11:18:46 -0800 (Fri, 29 Jan 2016) | 13 lines

ARM: don't mangle DAG constant if it has more than one use

The basic optimisation was to convert (mul $LHS, $complex_constant) into
roughly "(shl (mul $LHS, $simple_constant), $simple_amt)" when it was
expected to be cheaper. The original logic checks that the mul only has
one use (since we're mangling $complex_constant), but when used in even
more complex addressing modes there may be an outer addition that can
pick up the wrong value too.

I *think* the ARM addressing-mode problem is actually unreachable at the
moment, but that depends on complex assessments of the profitability of
pre-increment addressing modes so I've put a real check in there instead
of an assertion.
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@259247 91177308-0d34-0410-b5e6-96231b3b80d8

commit 5ad5d2c5359a4e878c732db59ee7fc6e0a25dc00
Author: Hans Wennborg
Date: Fri Jan 29 21:33:02 2016 +0000

Merging r259236:
------------------------------------------------------------------------
r259236 | spatel | 2016-01-29 12:21:02 -0800 (Fri, 29 Jan 2016) | 8 lines

[InstCombine] avoid an insertelement transformation that induces the
opposite extractelement fold (PR26354)

We would infinite loop because we created a shufflevector that was wider
than needed and then failed to combine that with the insertelement.
When subsequently visiting the extractelement from that shuffle, we see
that it's unnecessary, delete it, and trigger another visit to the
insertelement.
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@259245 91177308-0d34-0410-b5e6-96231b3b80d8

commit cd30d75375a03a290c6621da13cbab4f10545c56
Author: Tom Stellard
Date: Fri Jan 29 16:45:55 2016 +0000

Merging r258922:
------------------------------------------------------------------------
r258922 | marek.olsak | 2016-01-27 06:19:45 -0500 (Wed, 27 Jan 2016) | 12 lines

AMDGPU/SI: Stoney has only 16 LDS banks

Summary: This is a candidate for stable, along with all patches that add
the "stoney" processor.

Reviewers: tstellarAMD

Subscribers: arsenm

Differential Revision: http://reviews.llvm.org/D16485
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@259207 91177308-0d34-0410-b5e6-96231b3b80d8

commit a8a522e4217a621114bedcb1cedee056c59a6273
Author: Tom Stellard
Date: Fri Jan 29 16:45:52 2016 +0000

Merging r257666:
------------------------------------------------------------------------
r257666 | changpeng.fang | 2016-01-13 15:39:25 -0500 (Wed, 13 Jan 2016) | 2 lines

AMDGPU/SI: Update ISA version for FIJI
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@259206 91177308-0d34-0410-b5e6-96231b3b80d8

commit c3c52626df3d5b9bd06b160450da8335deb24dc8
Author: Daniel Sanders
Date: Thu Jan 28 21:05:40 2016 +0000

Bring back the test-suite export in test-release without bringing back
the build failures.

Summary: r257791 disabled the test-suite export since the addition of
CMakeLists.txt was causing build failures. This patch exports the
test-suite again but does so outside the source tree so that it isn't
included in the Phase[123] builds.
Reviewers: hans

Subscribers: llvm-commits

Differential Revision: http://reviews.llvm.org/D16679

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@259093 91177308-0d34-0410-b5e6-96231b3b80d8

commit 72901a8afaae6c9f8ea63ba1c9c9d4699c7eec49
Author: Hans Wennborg
Date: Thu Jan 28 18:23:25 2016 +0000

Merging r258971:
------------------------------------------------------------------------
r258971 | spatel | 2016-01-27 11:22:45 -0800 (Wed, 27 Jan 2016) | 26 lines

[SimplifyCFG] limit recursion depth when speculating instructions (PR26308)

This is a fix for:
https://llvm.org/bugs/show_bug.cgi?id=26308

With the switch to using the TTI cost model in:
http://reviews.llvm.org/rL228826
...it became possible to hit a zero-cost cycle of instructions
(gep -> phi -> gep...), so we need a cap for the recursion in
DominatesMergePoint().

A recursion depth parameter was already added for a different reason in:
http://reviews.llvm.org/rL255660
...so we can just set a limit for it.

I pulled "10" out of the air and made it an independent parameter that we
can play with. It might be higher than it needs to be given the currently
low default value of PHINodeFoldingThreshold (2). That's the starting
cost value that we enter the recursion with, and most instructions have
cost set to TCC_Basic (1), so I don't think we're going to speculate more
than 2 instructions with the current parameters.

As noted in the review and the TODO comment, we can do better than just
limiting recursion depth.
Differential Revision: http://reviews.llvm.org/D16637
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@259066 91177308-0d34-0410-b5e6-96231b3b80d8

commit 131d76722983cb030c392bcb50bba940e98ea0c6
Author: Hans Wennborg
Date: Thu Jan 28 18:16:55 2016 +0000

Merging r258471:
------------------------------------------------------------------------
r258471 | pirama | 2016-01-21 17:16:57 -0800 (Thu, 21 Jan 2016) | 14 lines

Do not lower VSETCC if operand is an f16 vector

Summary: SETCC with f16 vectors has OperationAction set to Expand but
still gets lowered to FCM* intrinsics based on its result type. This
patch skips lowering of VSETCC if the operand is an f16 vector.

v4 and v8 tests included.

Reviewers: ab, jmolloy

Subscribers: srhines, llvm-commits

Differential Revision: http://reviews.llvm.org/D15361
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@259064 91177308-0d34-0410-b5e6-96231b3b80d8

commit 82cf8c0ebce3d4cac59da2cc36df0c0cd9730d72
Author: Hans Wennborg
Date: Wed Jan 27 00:19:52 2016 +0000

Merging r258891:
------------------------------------------------------------------------
r258891 | hans | 2016-01-26 16:19:05 -0800 (Tue, 26 Jan 2016) | 25 lines

test-release.sh: Ignore LC_CTYPE in sed invocation on Darwin

Here, sed is used to prepare object files for comparison via cmp. On my
Darwin 15.4.0 machine, LC_CTYPE is set to UTF-8 (by default, I believe).
Under these circumstances, anything sed is made to read will be treated
as UTF-8, prompting it to signal an error if it is not, like so:

    % sed s/a/b/ <(head -n1 /dev/random) >/dev/null; echo $?
    sed: RE error: illegal byte sequence
    1
    %

To make sed work as expected, I need to set LC_CTYPE to C:

    % env LC_CTYPE=C sed s/a/b/ <(head -n1 /dev/random) >/dev/null; echo $?
    0
    %

Without this change, sed will exit with an error for every single file
that it compares between phase 2 and phase 3, thereby making it look as
if the differences were far larger than they are.

Patch by Elias Pipping!

Differential Revision: http://reviews.llvm.org/D16548
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258892 91177308-0d34-0410-b5e6-96231b3b80d8

commit 5eefadb302242035deaf04c5585bb4cd46125deb
Author: Tom Stellard
Date: Tue Jan 26 23:57:01 2016 +0000

Merging r258386:
------------------------------------------------------------------------
r258386 | thomas.stellard | 2016-01-20 23:28:34 -0500 (Wed, 20 Jan 2016) | 14 lines

AMDGPU/SI: Pass whether to use the SI scheduler via Target Attribute

Summary: Currently the SI scheduler can be selected via command line
option, but it turned out it would be better if it was selectable via a
Target Attribute. This patch adds "si-scheduler" attribute to the backend.

Reviewers: tstellarAMD, echristo

Subscribers: echristo, arsenm

Differential Revision: http://reviews.llvm.org/D16192
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258885 91177308-0d34-0410-b5e6-96231b3b80d8

commit 44fb5881d8edf448d6231a5b8df583aecd6bcd42
Author: Sanjoy Das
Date: Tue Jan 26 22:29:46 2016 +0000

Merging r258184:
------------------------------------------------------------------------
r258184 | sanjoy | 2016-01-19 12:53:51 -0800 (Tue, 19 Jan 2016) | 20 lines

[SCEV] Fix PR26207

In some cases, the max backedge taken count can be more conservative than
the exact backedge taken count (for instance, because
ScalarEvolution::getRange is not control-flow sensitive whereas
computeExitLimitFromICmp can be).
In these cases, computeExitLimitFromCond (specifically the bit that deals
with `and` and `or` instructions) can create an ExitLimit instance with a
`SCEVCouldNotCompute` max backedge count expression, but a computable
exact backedge count expression. This violates an implicit SCEV
assumption: a computable exact BE count should imply a computable max BE
count.

This change

 - Makes the above implicit invariant explicit by adding an assert to
   ExitLimit's constructor

 - Changes `computeExitLimitFromCond` to be more robust around
   conservative max backedge counts
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258869 91177308-0d34-0410-b5e6-96231b3b80d8

commit 4d1ef71f362e014aaaaefeb36abe83c24b578e40
Author: Hans Wennborg
Date: Tue Jan 26 19:44:49 2016 +0000

Revert accidental changes from r258805

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258844 91177308-0d34-0410-b5e6-96231b3b80d8

commit 9a498947cdb25737faecfdabcb64848432c49d68
Author: Dimitry Andric
Date: Tue Jan 26 19:43:59 2016 +0000

Merging r258436:
------------------------------------------------------------------------
r258436 | dim | 2016-01-21 22:57:49 +0100 (Thu, 21 Jan 2016) | 17 lines

Let test-release.sh checkout subprojects directly into the target tree,
instead of using symlinks

Summary: In the past I have run into several problems with the way
`test-release.sh` creates all the subproject directories as siblings, and
then uses symlinks to stitch them all together. In some scenarios this
leads to clang not being able to find header files, etc.

This patch changes the script so it directly exports into the correct
target locations for each subproject.
Reviewers: hans

Subscribers: emaste, llvm-commits

Differential Revision: http://reviews.llvm.org/D16420
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258842 91177308-0d34-0410-b5e6-96231b3b80d8

commit 4b85564ba4a41465155b9128a68e5e14fea78365
Author: Hans Wennborg
Date: Tue Jan 26 19:31:16 2016 +0000

Merging r258729:
------------------------------------------------------------------------
r258729 | matze | 2016-01-25 14:08:25 -0800 (Mon, 25 Jan 2016) | 13 lines

X86ISelLowering: Fix cmov(cmov) special lowering bug

There's a special case in EmitLoweredSelect() that produces an improved
lowering for cmov(cmov) patterns. However this special lowering is
currently broken if the inner cmov has multiple users so this patch stops
using it in this case.

If you wonder why this wasn't fixed by continuing to use the special
lowering and inserting a 2nd PHI for the inner cmov: I believe this would
incur additional copies/register pressure so the special lowering does
not improve upon the normal one anymore in this case.

This fixes http://llvm.org/PR26256 (= rdar://24329747)
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258840 91177308-0d34-0410-b5e6-96231b3b80d8

commit db6cb1a90cd0ab35e2dadc97962a5d67742c0bbc
Author: James Molloy
Date: Tue Jan 26 13:30:49 2016 +0000

Merging r258690:
------------------------------------------------------------------------
r258690 | jamesm | 2016-01-25 14:49:36 +0000 (Mon, 25 Jan 2016) | 7 lines

[DemandedBits] Fix computation of demanded bits for ICmps

The computation of ICmp demanded bits is independent of the individual
operand being evaluated. We simply return a mask consisting of the
minimum leading zeroes of both operands.

We were incorrectly passing "I" to ComputeKnownBits - this should be
"UserI->getOperand(0)".
In cases where we were evaluating the 1th operand, we were taking the
minimum leading zeroes of it and itself. This should fix PR26266.
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258805 91177308-0d34-0410-b5e6-96231b3b80d8

commit 836d2ad83c5e955a23f6e3b78418cb250c95c88b
Author: Hans Wennborg
Date: Mon Jan 25 22:24:50 2016 +0000

Merging r258406:
------------------------------------------------------------------------
r258406 | vedantk | 2016-01-21 09:04:42 -0800 (Thu, 21 Jan 2016) | 16 lines

[GCOV] Avoid emitting profile arcs for module and skeleton CUs

Do not emit profile arc files and note files for module and skeleton
CU's.

Our users report seeing unexpected *.gcda and *.gcno files in their
projects when using gcov-style profiling with modules or frameworks.
The unwanted files come from these modules. This is not very helpful
for end-users. Further, we've seen reports of instrumented programs
crashing while writing these files out (due to I/O failures).
rdar://problem/22838296

Reviewed-by: aprantl

Differential Revision: http://reviews.llvm.org/D15997
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258731 91177308-0d34-0410-b5e6-96231b3b80d8

commit 16f83af7618a4dfb4ef1891b07adb23cc54e4c86
Author: Hans Wennborg
Date: Fri Jan 22 18:37:31 2016 +0000

Merging r258416 and r258428:
------------------------------------------------------------------------
r258416 | spatel | 2016-01-21 10:01:57 -0800 (Thu, 21 Jan 2016) | 2 lines

make helper functions static; NFCI
------------------------------------------------------------------------
------------------------------------------------------------------------
r258428 | spatel | 2016-01-21 12:19:54 -0800 (Thu, 21 Jan 2016) | 15 lines

[LibCallSimplifier] don't get fooled by a fake fmin()

This is similar to the bug/fix:
https://llvm.org/bugs/show_bug.cgi?id=26211
http://reviews.llvm.org/rL258325

The fmin() test case reveals another bug caused by sloppy code
duplication. It will crash without this patch because fp128 is a valid
floating-point type, but we would think that we had matched a function
that used doubles.

The new helper function can be used to replace similar checks that are
used in several other places in this file.
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258512 91177308-0d34-0410-b5e6-96231b3b80d8

commit c89d9654310e0f5b1171888c6573d09c9e66d0c4
Author: Hans Wennborg
Date: Fri Jan 22 18:26:38 2016 +0000

Merging r257886:
------------------------------------------------------------------------
r257886 | jamesm | 2016-01-15 02:36:01 -0800 (Fri, 15 Jan 2016) | 3 lines

[CodeGenPrepare] Try and appease sanitizers

dupRetToEnableTailCallOpts(BB) can invalidate BB. It must run *after*
we iterate across BB!
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258510 91177308-0d34-0410-b5e6-96231b3b80d8

commit 961a0e424cc7a63ee57cc8506c8a04cbf6012e1f
Author: Hans Wennborg
Date: Wed Jan 20 21:49:02 2016 +0000

Merging r258325:
------------------------------------------------------------------------
r258325 | spatel | 2016-01-20 09:41:14 -0800 (Wed, 20 Jan 2016) | 21 lines

[LibCallSimplifier] don't get fooled by a fake sqrt()

The test case will crash without this patch because the subsequent call
to hasUnsafeAlgebra() assumes that the call instruction is an
FPMathOperator (ie, returns an FP type).

This part of the function signature check was omitted for the sqrt()
case, but seems to be in place for all other transforms.

Before:
http://reviews.llvm.org/rL257400
...we would have needlessly continued execution in optimizeSqrt(), but
the bug was harmless because we'd eventually fail some other check and
return without damage.

This should fix:
https://llvm.org/bugs/show_bug.cgi?id=26211

Differential Revision: http://reviews.llvm.org/D16198
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258353 91177308-0d34-0410-b5e6-96231b3b80d8

commit 3acb8d3c6d4e470172fb244c809bc6fdd7948c29
Author: Hans Wennborg
Date: Wed Jan 20 21:30:57 2016 +0000

Merging r257940:
------------------------------------------------------------------------
r257940 | djg | 2016-01-15 13:56:40 -0800 (Fri, 15 Jan 2016) | 10 lines

[SelectionDAG] CSE nodes with differing SDNodeFlags

In the optimizer (GVN etc.) when eliminating redundant nodes with
different flags, the flags are ignored for the purposes of testing for
congruence, and then intersected for the purposes of producing a result
that supports the union of all the uses. This commit makes SelectionDAG's
CSE do the same thing, allowing it to CSE nodes in more cases.
This fixes PR26063.

Differential Revision: http://reviews.llvm.org/D15957
------------------------------------------------------------------------

Merging r257942:
------------------------------------------------------------------------
r257942 | djg | 2016-01-15 14:07:35 -0800 (Fri, 15 Jan 2016) | 2 lines

Remove a now-empty file left behind by r257940.
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258351 91177308-0d34-0410-b5e6-96231b3b80d8

commit 3260476414aa2e03566d205f742220a382f4ce07
Author: Hans Wennborg
Date: Wed Jan 20 21:14:05 2016 +0000

Merging r258273:
------------------------------------------------------------------------
r258273 | josepht | 2016-01-19 18:15:15 -0800 (Tue, 19 Jan 2016) | 37 lines

[Inliner/WinEH] Honor implicit nounwinds

Summary: Funclet EH tables require that a given funclet have only one
unwind destination for exceptional exits. The verifier will therefore
reject e.g. two cleanuprets with different unwind dests for the same
cleanup, or two invokes exiting the same funclet but to different unwind
dests.

Because catchswitch has no 'nounwind' variant, and because IR producers
are not *required* to annotate calls which will not unwind as 'nounwind',
it is legal to nest a call or an "unwind to caller" catchswitch within a
funclet pad that has an unwind destination other than caller; it is
undefined behavior for such a call or catchswitch to unwind.

Normally when inlining an invoke, calls in the inlined sequence are
rewritten to invokes that unwind to the callsite invoke's unwind
destination, and "unwind to caller" catchswitches in the inlined sequence
are rewritten to unwind to the callsite invoke's unwind destination.
However, if such a call or "unwind to caller" catchswitch is located in a
callee funclet that has another exceptional exit with an unwind
destination within the callee, applying the normal transformation would
give that callee funclet multiple unwind destinations for its exceptional
exits. There would be no way for EH table generation to determine which
is the "true" exit, and the verifier would reject the function
accordingly.

Add logic to the inliner to detect these cases and leave such calls and
"unwind to caller" catchswitches as calls and "unwind to caller"
catchswitches in the inlined sequence.

This fixes PR26147.

Reviewers: rnk, andrew.w.kaylor, majnemer

Subscribers: alexcrichton, llvm-commits

Differential Revision: http://reviews.llvm.org/D16319
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258349 91177308-0d34-0410-b5e6-96231b3b80d8

commit 38e40410b1fa6441db511e760bc6ae263a8bbaee
Author: Renato Golin
Date: Wed Jan 20 18:01:05 2016 +0000

Merging r258308: [AArch64] Fix two bugs in the .inst directive

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258326 91177308-0d34-0410-b5e6-96231b3b80d8

commit 870ff87d1cd25f9a2dd01d7c75489a63eca377c2
Author: Quentin Colombet
Date: Wed Jan 20 01:14:03 2016 +0000

Merging r258221:
------------------------------------------------------------------------
r258221 | qcolombet | 2016-01-19 15:29:03 -0800 (Tue, 19 Jan 2016) | 8 lines

[X86] Do not run shrink-wrapping on function with split-stack attribute
or HiPE calling convention. The implementation of the related callbacks
in the x86 backend for such functions are not ready to deal with a
prologue block that is not the entry block of the function.

This fixes PR26107, but the longer term solution would be to fix those
callbacks.
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258269 91177308-0d34-0410-b5e6-96231b3b80d8

commit 90fef5a5b6514f60396e81d7fa20581d05ca659b
Author: Quentin Colombet
Date: Wed Jan 20 01:09:12 2016 +0000

Merging r258207:
------------------------------------------------------------------------
r258207 | qcolombet | 2016-01-19 14:31:12 -0800 (Tue, 19 Jan 2016) | 1 line

[MachineFunction] Constify getter. NFC.
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258268 91177308-0d34-0410-b5e6-96231b3b80d8

commit 770ec8cf9ae215e26cb6d946b9d533151fe0558d
Author: Hans Wennborg
Date: Wed Jan 20 00:48:30 2016 +0000

Merging r257977:
------------------------------------------------------------------------
r257977 | kfischer | 2016-01-15 17:11:33 -0800 (Fri, 15 Jan 2016) | 1 line

[DwarfDebug] Move MergeValues to .cpp, NFC
------------------------------------------------------------------------

Merging r257979:
------------------------------------------------------------------------
r257979 | kfischer | 2016-01-15 17:15:32 -0800 (Fri, 15 Jan 2016) | 11 lines

[DwarfDebug] Don't merge DebugLocEntries if their pieces overlap

Summary: Later in DWARF emission we check that DebugLocEntries have
non-overlapping pieces, so we should create any such entries by merging
here.

Fixes PR26163.

Reviewers: aprantl

Differential Revision: http://reviews.llvm.org/D16249
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258267 91177308-0d34-0410-b5e6-96231b3b80d8

commit d26a2e75e58f56a289b911c0bf582be4f8f655f1
Author: NAKAMURA Takumi
Date: Wed Jan 20 00:32:09 2016 +0000

[r257857] lli: use llvm::utostr() instead of std::to_string().
git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258266 91177308-0d34-0410-b5e6-96231b3b80d8

commit 316ab7147bf233fd6a84977173f99b3fc9a26e0e
Author: NAKAMURA Takumi
Date: Wed Jan 20 00:28:22 2016 +0000

[r257732] Mark remote-JIT tests as XFAIL, as well as win32, for targeting mingw32.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258265 91177308-0d34-0410-b5e6-96231b3b80d8

commit f5575ecd57c4ab8cdae1a80fecc01029d14fe4e6
Author: Hans Wennborg
Date: Tue Jan 19 20:49:25 2016 +0000

Merging r257875:
------------------------------------------------------------------------
r257875 | jamesm | 2016-01-15 01:20:19 -0800 (Fri, 15 Jan 2016) | 11 lines

[InstCombine] Rewrite bswap/bitreverse handling completely.

There are several requirements that ended up with this design;

1. Matching bitreversals is too heavyweight for InstCombine and doesn't
   really need to be done so early.
2. Bitreversals and byteswaps are very related in their matching logic.
3. We want to implement support for matching more advanced
   bswap/bitreverse patterns like partial bswaps/bitreverses.
4. Bswaps are best matched early in InstCombine.

The result of these is that a new utility function is created in
Transforms/Utils/Local.h that can be configured to search for bswaps,
bitreverses or both. InstCombine uses it to find only bswaps, CGP uses it
to find only bitreversals. We can then extend the matching logic in one
place only.
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258180 91177308-0d34-0410-b5e6-96231b3b80d8

commit e12bf2aba135af15b33cca8a8c0fb80189a16b80
Author: Hans Wennborg
Date: Tue Jan 19 19:28:41 2016 +0000

Merging r258168:
------------------------------------------------------------------------
r258168 | hans | 2016-01-19 11:21:58 -0800 (Tue, 19 Jan 2016) | 3 lines

test-release.sh: Use CMake also for Darwin

This didn't work for 3.7, but hopefully it should work now.
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258170 91177308-0d34-0410-b5e6-96231b3b80d8

commit 1618eb04cdfdd3febf77bc67cdac5307e5528b96
Author: Hans Wennborg
Date: Tue Jan 19 18:53:02 2016 +0000

Merging r257925, r257929, r257930, and r257997:
------------------------------------------------------------------------
r257925 | mren | 2016-01-15 11:35:42 -0800 (Fri, 15 Jan 2016) | 10 lines

CXX_FAST_TLS calling convention: fix issue on X86-64.

When we have a single basic block, the explicit copy-back instructions
should be inserted right before the terminator. Before this fix, they
were wrongly placed at the beginning of the basic block.

I will commit fixes to other platforms as well.

PR26136
------------------------------------------------------------------------
------------------------------------------------------------------------
r257929 | mren | 2016-01-15 12:13:28 -0800 (Fri, 15 Jan 2016) | 10 lines

CXX_FAST_TLS calling convention: fix issue on AArch64.

When we have a single basic block, the explicit copy-back instructions
should be inserted right before the terminator. Before this fix, they
were wrongly placed at the beginning of the basic block.

I will commit fixes to other platforms as well.
PR26136
------------------------------------------------------------------------
------------------------------------------------------------------------
r257930 | mren | 2016-01-15 12:24:11 -0800 (Fri, 15 Jan 2016) | 8 lines

CXX_FAST_TLS calling convention: fix issue on ARM.

When we have a single basic block, the explicit copy-back instructions
should be inserted right before the terminator. Before this fix, they
were wrongly placed at the beginning of the basic block.

PR26136
------------------------------------------------------------------------
------------------------------------------------------------------------
r257997 | mren | 2016-01-16 08:39:46 -0800 (Sat, 16 Jan 2016) | 12 lines

CXX_FAST_TLS calling convention: fix issue on x86-64.

%RBP can't be handled explicitly. We generate the following code:

    pushq %rbp
    movq %rsp, %rbp
    ...
    movq %rbx, (%rbp) ## 8-byte Spill

where %rbp will be overwritten by the spilled value. The fix is to let
PEI handle %RBP.

PR26136
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258162 91177308-0d34-0410-b5e6-96231b3b80d8

commit aa96fb86c3304e81c2f53700223d0e795c302276
Author: Hans Wennborg
Date: Tue Jan 19 18:26:37 2016 +0000

Merging r257902 (and r257775)
------------------------------------------------------------------------
r257775 | jyknight | 2016-01-14 08:33:21 -0800 (Thu, 14 Jan 2016) | 3 lines

Revert "Stop increasing alignment of externally-visible globals on ELF
platforms."

This reverts commit r257719, due to PR26144.
------------------------------------------------------------------------
------------------------------------------------------------------------
r257902 | jyknight | 2016-01-15 08:33:06 -0800 (Fri, 15 Jan 2016) | 17 lines

Stop increasing alignment of externally-visible globals on ELF platforms.
With ELF, the alignment of a global variable in a shared library will get
copied into executables linked against it, if the executable even
accesses the variable. So, it's not possible to implicitly increase
alignment based on access patterns, or you'll break existing binaries.

This happened to affect libc++'s std::cout symbol, for example. See
thread: http://thread.gmane.org/gmane.comp.compilers.clang.devel/45311

(This is a re-commit of r257719, without the bug reported in PR26144.
I've tweaked the code to not assert-fail in enforceKnownAlignment when
computeKnownBits doesn't recurse far enough to find the underlying
Alloca/GlobalObject value.)

Differential Revision: http://reviews.llvm.org/D16145
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258155 91177308-0d34-0410-b5e6-96231b3b80d8

commit ed504bedd7420790d55e441e35e5449eaa40029e
Author: Hans Wennborg
Date: Tue Jan 19 17:28:24 2016 +0000

Merging r257905:
------------------------------------------------------------------------
r257905 | hans | 2016-01-15 09:04:45 -0800 (Fri, 15 Jan 2016) | 3 lines

test-release.sh: Fix clang-tools-extra symlink for CMake build

The CMake and Autoconf builds want the symlink set up differently.
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@258146 91177308-0d34-0410-b5e6-96231b3b80d8

commit c1316b6adfbb17b961a3bee357e728ca0d4d1c96
Author: Hans Wennborg
Date: Thu Jan 14 23:24:17 2016 +0000

Merging r257791:
------------------------------------------------------------------------
r257791 | hans | 2016-01-14 11:21:14 -0800 (Thu, 14 Jan 2016) | 4 lines

Exclude test-suite from CMake builds in test-release.sh

It's broken. In 3.7 there wasn't a CMake build for test-suite at all, so
we're not losing something we had before.
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@257836 91177308-0d34-0410-b5e6-96231b3b80d8

commit 25d64abdb39a834541edbafdc686f371dad58a76
Author: Hans Wennborg
Date: Thu Jan 14 17:52:28 2016 +0000

Merging r257730:
------------------------------------------------------------------------
r257730 | majnemer | 2016-01-13 17:20:03 -0800 (Wed, 13 Jan 2016) | 11 lines

[X86] Don't alter HasOpaqueSPAdjustment after we've relied on it

We rely on HasOpaqueSPAdjustment not changing after we've calculated
things based on it. Things like whether or not we can use 'rep;movs' to
copy bytes around, that sort of thing. If it changes, invariants in the
backend will quietly break. This situation arose when we had a call to
memcpy *and* a COPY of the FLAGS register where we would attempt to
reference local variables using %esi, a register that was clobbered by
the 'rep;movs'.

This fixes PR26124.
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@257779 91177308-0d34-0410-b5e6-96231b3b80d8

commit 7b9eef037dbacab102881f19826fb04cfe69c7e7
Author: Hans Wennborg
Date: Thu Jan 14 00:23:32 2016 +0000

ReleaseNotes.rst: a few entries from Rafael

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@257725 91177308-0d34-0410-b5e6-96231b3b80d8

commit 53d8ef00d82460b9c8ce08617d91bbce8313d4a3
Author: Hans Wennborg
Date: Wed Jan 13 21:18:59 2016 +0000

Merging r257648:
------------------------------------------------------------------------
r257648 | hans | 2016-01-13 10:59:45 -0800 (Wed, 13 Jan 2016) | 1 line

Fix struct/class mismatch for MachineSchedContext
------------------------------------------------------------------------

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@257668 91177308-0d34-0410-b5e6-96231b3b80d8

commit 38fcb6f10f0ae867bfe796f26bf1a336bf0dddf0
Author: Dimitry Andric Date: Wed Jan 13 19:37:51 2016 +0000 Merging r257645: ------------------------------------------------------------------------ r257645 | dim | 2016-01-13 19:29:46 +0100 (Wed, 13 Jan 2016) | 22 lines Avoid undefined behavior in LinkAllPasses.h The LinkAllPasses.h file is included in several main programs, to force a large number of passes to be linked in. However, the ForcePassLinking constructor uses undefined behavior, since it calls member functions on `nullptr`, e.g.: ((llvm::Function*)nullptr)->viewCFGOnly(); llvm::RGPassManager RGM; ((llvm::RegionPass*)nullptr)->runOnRegion((llvm::Region*)nullptr, RGM); When the optimization level is -O2 or higher, the code below the first nullptr dereference is optimized away, and replaced by `ud2` (on x86). Therefore, the calls after that first dereference are never emitted. In my case, I noticed there was no call to `llvm::sys::RunningOnValgrind()`! Replace instances of dereferencing `nullptr` with either objects on the stack, or regular function calls. Differential Revision: http://reviews.llvm.org/D15996 ------------------------------------------------------------------------ git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@257660 91177308-0d34-0410-b5e6-96231b3b80d8 commit 9faaefea9cbef6453486ed825c1ca4305bf68324 Author: Hans Wennborg Date: Wed Jan 13 19:03:44 2016 +0000 Drop 'svn' suffix from version. git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@257651 91177308-0d34-0410-b5e6-96231b3b80d8 commit 5ab5731312b6a8736fbe7fad1cb10f384b3a295e Author: Hans Wennborg Date: Wed Jan 13 17:34:56 2016 +0000 Creating release_38 branch off revision 257626 git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_38@257630 91177308-0d34-0410-b5e6-96231b3b80d8 commit 1526147c0ad76667de046ef168d5cc5eee381bb7 Author: TB Schardl Date: Tue Jan 12 12:40:37 2016 +0000 Bug fix to include setSuccessor method on reattach instruction. 
commit 2b1b34e00cbc085a4a9a290c65fffaabae9517dc Author: TB Schardl Date: Thu Dec 31 04:05:48 2015 +0000 Add -instrument-cilk support to detach2cilk, cilkabi commit 4328b4468c0e42c1f89f5212e1386c38394edf20 Merge: 062301f913b 8a32dc47d61 Author: TB Schardl Date: Wed Dec 30 01:45:54 2015 +0000 Merge branch 'master' of http://llvm.org/git/llvm into tb-scratch commit 062301f913b5ac657607f0c758392ac8a18d5c13 Merge: 9893cc49b22 48a798cb4b4 Author: TB Schardl Date: Tue Dec 29 22:23:46 2015 +0000 Merge branch 'tb-scratch' of github.com:taekwonbilly/Parallel-IR into tb-scratch commit 9893cc49b223291071ea6633cd3f5c376acce9dd Author: TB Schardl Date: Tue Dec 29 22:22:01 2015 +0000 SimplifyCFG now removes unnecessary Sync instructions. commit 48a798cb4b473470ad6ceaa6cc3e45dd569d0627 Merge: 54dbddeaec7 8d00ea68834 Author: Billy Moses Date: Wed Nov 11 10:50:51 2015 -0500 for counting commit 54dbddeaec7fa2bcdb3ad906c2cb99232342f00b Merge: 19481e914d1 88d51ce445e Author: Billy Moses Date: Wed Nov 11 10:18:55 2015 -0500 moded commit 8d00ea68834b61ce260b8111beb594cbdc8c78b9 Merge: 2ae39eb69c5 65cad952e45 Author: TB Schardl Date: Fri Nov 6 11:51:30 2015 +0000 Merge branch 'master' of http://llvm.org/git/llvm into tb-scratch commit 2ae39eb69c54cfb2206514873bca9cb1ac3738b0 Author: TB Schardl Date: Thu Nov 5 14:58:05 2015 +0000 [SimplifyCFG] Fixed bug where empty reattach blocks with multiple predecessors would crash this pass. 
commit 7bd0f59e1aa75abe8a238d1ec166d6148722ebdd Merge: 8ae8e06e3cd c135da21a3c Author: TB Schardl Date: Wed Nov 4 02:12:41 2015 +0000 Merge branch 'billy-scratch' of github.com:taekwonbilly/Parallel-IR into tb-scratch commit 8ae8e06e3cdf762ce50de096115ecfac5c998b63 Merge: a9530cd93a2 7e6636cb71f Author: TB Schardl Date: Wed Nov 4 01:26:22 2015 +0000 Merge branch 'master' of http://llvm.org/git/llvm into tb-scratch commit c135da21a3cca833224099aeeac85aad0ec5144d Author: Billy Moses Date: Mon Nov 2 23:13:23 2015 -0500 all cleaned up & ready to go commit a9530cd93a293b6e21665883a74b42859061acd8 Merge: 329f5fad3f7 1965754e592 Author: TB Schardl Date: Fri Oct 16 14:24:35 2015 +0000 Merge branch 'master' of http://llvm.org/git/llvm into tb-scratch commit 329f5fad3f72dd84a3e4cf5818512a6b7e81c657 Merge: e0717ad48cc 600b09339de Author: TB Schardl Date: Fri Oct 16 00:37:12 2015 +0000 Merge branch 'master' of http://llvm.org/git/llvm into tb-scratch commit e0717ad48cc7c447b4f1159116b06ff82c4efdd3 Merge: 20e95d87b5e 4b6405d130b Author: TB Schardl Date: Fri Oct 16 00:36:05 2015 +0000 Merge branch 'master' of http://llvm.org/git/llvm into tb-scratch commit 20e95d87b5e8234390f2b4cc6ef46a5ebea58e0c Merge: 44d4e427c7f bcd41c02dde Author: TB Schardl Date: Tue Oct 13 16:57:43 2015 +0000 Merge branch 'master' of http://llvm.org/git/llvm into tb-scratch commit cac7ff23aac4127106c74d7cdaa5b6f11d3d5e00 Merge: ab253e4510c 387b1f61aad Author: Billy Moses Date: Tue Oct 13 12:34:49 2015 -0400 Merge branch 'master' of github.com:taekwonbilly/Parallel-IR into billy-scratch commit ab253e4510c21e111e4c56fda345c19d3b232650 Author: Billy Moses Date: Tue Oct 13 12:34:31 2015 -0400 cache loop2cilk commit 44d4e427c7f008295af785fbad29857952be6d9a Merge: 387b1f61aad 938c3d3164e Author: TB Schardl Date: Tue Oct 13 12:52:44 2015 +0000 Merge branch 'master' of http://llvm.org/git/llvm into tb-scratch commit 387b1f61aad986ddc9032d82e2e48e9c5e1b064d Merge: 81e2fd12aea 3d58b720c31 Author: TB Schardl Date: 
Fri Oct 2 19:47:27 2015 +0000 Merge branch 'master' of http://llvm.org/git/llvm into tb-scratch commit 81e2fd12aea84c2ab59cd73cbcad2665a947ce0b Author: TB Schardl Date: Fri Oct 2 19:43:24 2015 +0000 Adding Detach2Cilksan pass to enable Cilksan race detection. commit 7a634e24c5bc7a520e8979646da17c09895f5425 Author: TB Schardl Date: Fri Oct 2 19:42:47 2015 +0000 Some debugging of Detach2Cilk commit cdf14afd5eeb21dedc32c3a62b1f76af95016974 Merge: 25f43658061 36caf0659ff Author: TB Schardl Date: Tue Sep 22 17:53:11 2015 +0000 Merge branch 'master' of http://llvm.org/git/llvm into tb-scratch commit 25f436580618875268ef313894e05802617bbdf0 Author: TB Schardl Date: Tue Sep 22 04:00:34 2015 +0000 Fixing loop rotation to prevent it from destroying sync instructions. commit 8ec1e7597748edd42654657f992aa4209bd04cf9 Merge: 4fc3d85490a dabf510ba1b Author: TB Schardl Date: Sun Sep 20 19:06:45 2015 +0000 Merge branch 'master' of http://llvm.org/git/llvm into tb-scratch commit 4fc3d85490a81d6adbd21b5f66646a9f397fe333 Author: TB Schardl Date: Sun Sep 20 19:06:24 2015 +0000 Fixed GVN to handle scalarPRE around detach/reattach and to abort load PRE in the event of an aliased access from a detach or sync. commit dc7cd94ca46ba477e113d2844de893b82b95b081 Author: TB Schardl Date: Sun Sep 20 19:05:06 2015 +0000 Updated AliasAnalysis to analyze detached blocks for aliasing information for detach and sync instructions. commit 421d2351ba4e14ff211a3c6cbe9258ccddf19afa Merge: 54b97afc6bc 29f50e97835 Author: TB Schardl Date: Sun Sep 13 12:11:13 2015 +0000 Merge branch 'master' of http://llvm.org/git/llvm into tb-scratch commit 54b97afc6bc145d1e28a8a3c94de524d809cddf1 Author: TB Schardl Date: Wed Sep 9 20:25:28 2015 +0000 Making syncs look like fences, in order to fix memory analysis issues. 
commit 4420c17e34959d2a33ba4c9fd9ae5ff6066f797a Merge: e6d3b51ad7d 3c76435341d Author: TB Schardl Date: Wed Sep 9 01:12:29 2015 +0000 Merge branch 'billy-scratch' of github.com:taekwonbilly/Parallel-IR into tb-scratch commit e6d3b51ad7de5aaece38701cbe0b9401f481b13c Merge: eaa3d3ce261 9e01a11e67c Author: TB Schardl Date: Tue Sep 8 21:47:52 2015 +0000 Merge branch 'master' of http://llvm.org/git/llvm into tb-scratch commit 3c76435341d943764ecafb324971a254c95b39df Author: Billy Moses Date: Tue Sep 8 16:40:32 2015 -0400 Working parallel opt pass commit eaa3d3ce261db5812277ba6cd250ce501f77849c Merge: d9eeab4f9c8 3d88beedefc Author: TB Schardl Date: Tue Sep 8 17:14:47 2015 +0000 Merge branch 'master' of http://llvm.org/git/llvm into tb-scratch commit d9eeab4f9c8bd662a771d87e73f61165c12cd14b Merge: f09f6e7a51b 7e316839810 Author: TB Schardl Date: Tue Sep 8 15:49:47 2015 +0000 Merge branch 'master' of http://llvm.org/git/llvm into tb-scratch commit 5f20c20dcf53f27e56915263e99d810bbf403697 Author: Billy Moses Date: Mon Sep 7 22:05:31 2015 -0400 Semi-working cilk pass commit f09f6e7a51b1b270a48d2f66312ff282f1ad6959 Author: TB Schardl Date: Fri Sep 4 12:13:17 2015 -0400 Fixed build problems with last merge. 
commit 8b666563572297a50f9a17efbd060e8f780f0f04 Merge: abe3f70de04 2354b37ae03 Author: TB Schardl Date: Fri Sep 4 11:40:09 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into tb-scratch commit 41ddcdf5d8e40544ece73167368487f0195b1b5f Merge: fea705e7114 abe3f70de04 Author: Billy Moses Date: Tue Sep 1 23:17:47 2015 -0400 Merge branch 'tb-scratch' of github.com:taekwonbilly/Parallel-IR into billy-scratch commit fea705e71145c13d37dcedf6b260ed38d75b7ad1 Merge: dd9331be0b0 19481e914d1 Author: Billy Moses Date: Tue Sep 1 17:14:52 2015 -0400 Merge branch 'tb-scratch' into billy-scratch commit dd9331be0b0f2c6172666774f3f9d3fb17121154 Author: Billy Moses Date: Tue Sep 1 17:13:27 2015 -0400 Commit detach pass before merge commit abe3f70de0450a6ff4d169e2f8a7c884f38b5b43 Merge: 61fde862bba ac515c40878 Author: TB Schardl Date: Tue Sep 1 16:59:07 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into tb-scratch commit 61fde862bba820f143ea0545dc1804fe53523efc Merge: 19481e914d1 9907691f42a Author: TB Schardl Date: Sun Aug 30 09:37:44 2015 -0400 Merge branch 'billy-scratch' of github.com:taekwonbilly/Parallel-IR into tb-scratch commit 19481e914d1b1c4ee1db106d8f01b986ba4f90ae Merge: fadec4720ee 2b5188b98a3 Author: TB Schardl Date: Sun Aug 30 08:51:34 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into tb-scratch commit fadec4720ee7b66c5f4a362e2f0e0e8b2c127ce6 Merge: 4fcaa4205d2 43928f79096 Author: TB Schardl Date: Sat Aug 29 12:03:38 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into tb-scratch commit 4fcaa4205d29c0c7c96d5e422f16db53db786e82 Author: TB Schardl Date: Sat Aug 29 12:03:11 2015 -0400 Fixed bug where JumpThreading would attempt to split reattach edges. commit 6342321c427d73af4fafe79c88d60d5945d192e2 Author: TB Schardl Date: Sat Aug 29 12:02:17 2015 -0400 Fixed bug where SCCP did not recognize detach/reattach/sync. 
commit cd5c25c6646f9fa4472be7f4148e938b3db180fc Author: TB Schardl Date: Fri Aug 28 18:12:45 2015 -0400 Removing dead code from SROA. commit 613e58985cd9077134dc120d465bbf4ad7c624b1 Merge: 16929701716 21f084aa722 Author: TB Schardl Date: Fri Aug 28 18:07:45 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into tb-scratch commit 9907691f42ac9a51278d9a4fb20496f1a08531cc Author: Billy Moses Date: Thu Aug 27 14:58:29 2015 -0400 Add temporary hack to enable compiling serial version to executable commit 42a2eef9caf19027aac8829f2e90cc3194e87fe4 Merge: 703f88a7461 2d184c72270 Author: Billy Moses Date: Wed Aug 26 16:57:08 2015 -0400 Merge branch 'tb-scratch' into billy-scratch commit 16929701716110895498f4d5528c740355545472 Merge: 2d184c72270 4abce6e698a Author: TB Schardl Date: Tue Aug 25 14:00:34 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into tb-scratch commit 2d184c7227076f1843ef28ab46c9a6736cb5faea Author: TB Schardl Date: Sun Aug 23 11:49:32 2015 -0400 Relaxed need for commutativity in serial TRE. commit aecdc8f291e3faa379ec24337be337095a685ea0 Author: TB Schardl Date: Sun Aug 23 11:49:03 2015 -0400 Fixed bug in BitcodeWriter with reattach causing opt to crash when emitting bitcode. commit f7f1cce493e65e181225f5d439cfdc1798717e2e Merge: 45d7087de1c 8724a428dfd Author: TB Schardl Date: Sat Aug 22 09:50:44 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 45d7087de1c8bc1360e107a30c937d9b24189f49 Author: TB Schardl Date: Sat Aug 22 09:43:10 2015 -0400 Draft enhancement to accumulator TRE to use identity values. commit 85eda242bd0b50027d4859450206d336e3e585f5 Merge: f135205b97a 0d125ca11e9 Author: TB Schardl Date: Tue Aug 18 11:25:00 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit f135205b97a8352113ff27c8fa8158aade75254a Author: TB Schardl Date: Tue Aug 18 10:40:50 2015 -0400 Adding 'getIdentity()' method to Instruction to enhance serial TRE. 
commit fe40d5f2a3d392c9836968fb0c8ba3df1ebc908c Merge: d3cdbb9137e 378e97e50c4 Author: TB Schardl Date: Mon Aug 17 08:52:52 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit d3cdbb9137e07f806ce69ae7f327749694b7b8b2 Merge: 653d0bbdd47 126b405bec6 Author: TB Schardl Date: Sat Aug 15 11:33:43 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 653d0bbdd47f7c8520941a9ea1ca5ce2d431bda5 Merge: 99611974297 26e17390798 Author: TB Schardl Date: Fri Aug 14 09:25:49 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 99611974297552647706e46eb290be13f1ee6a82 Merge: 4bf70c75ac9 22af77d94f3 Author: TB Schardl Date: Thu Aug 13 12:36:36 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 4bf70c75ac9f1d5eee6b5c2cbfbdb9b5d0de8f3b Merge: 4dec88872b7 a5ccfee2752 Author: TB Schardl Date: Tue Aug 11 13:31:22 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 4dec88872b7e91e3f680a304b98ee3f197f5e9db Merge: e2aac9890d9 abdf937a221 Author: TB Schardl Date: Mon Aug 10 12:53:34 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit e2aac9890d934cff8b1f09d5c31fa6c804b80bb0 Merge: 8b8574d13a1 73b16a70f16 Author: TB Schardl Date: Sat Aug 8 09:12:17 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 8b8574d13a13cab91984d55cb78ebfae7caaf941 Merge: 2ee8648835e 1962b1b6b7e Author: TB Schardl Date: Fri Aug 7 09:04:50 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 2ee8648835e211ba1a93501acb5ece9f3d5d406b Author: TB Schardl Date: Thu Aug 6 08:53:21 2015 -0400 Bug fix on marking Sync instructions as potentially reading or writing memory commit 156cf024ecde0d1a725e32239c3057c71297fcfa Merge: 7d823a9c882 7809bb2e968 Author: TB Schardl Date: Thu Aug 6 08:08:36 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 7d823a9c882be773768c6c38d92cad7da9880b2f Author: TB Schardl Date: 
Thu Aug 6 08:08:02 2015 -0400 Adding SyncInst to set of instructions that may read/write memory. commit 383d9f685189d8294df1f988e7b2c328b2227873 Author: TB Schardl Date: Tue Aug 4 15:15:40 2015 -0400 Fixed typos from previous merge. commit 90a25b1e5633c00cec6a5dd77b998aeb9bfbfc19 Merge: 7907e1dbfd7 a639e155a28 Author: TB Schardl Date: Tue Aug 4 14:30:25 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 7907e1dbfd714cc121978597e0e552b1aa6eb195 Merge: 9819737b739 c71235ab7d7 Author: TB Schardl Date: Fri Jul 31 08:49:30 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 9819737b7396013f3d6dee738b070f11b1a52e8c Merge: 2c1c7bc0320 dc9125e8d13 Author: TB Schardl Date: Wed Jul 29 08:48:13 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 2c1c7bc0320cf3d5a74b2ad8cf91f24fa641da97 Author: TB Schardl Date: Wed Jul 29 08:47:45 2015 -0400 Adding function to SimplifyCFG to elide detach statements whose continuation immediately syncs. commit c950f20aa21eca8300eed7b10f98e4b61109311d Author: TB Schardl Date: Tue Jul 28 10:48:23 2015 -0400 Added optimization to remove trivial reattach blocks. 
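The SimplifyCFG change above (eliding a detach whose continuation immediately syncs) targets a pattern like the following. This is a hedged sketch written in the later sync-region syntax described at the top of this log (the 2015-era commits predate sync regions); block names are hypothetical:

```llvm
; Spawning %task buys no parallelism here: the continuation does nothing
; but wait, so the detach/reattach pair can be folded into a plain branch.
define void @spawn_then_sync() {
entry:
  %sr = call token @llvm.syncregion.start()
  detach within %sr, label %task, label %cont
task:
  ; ... spawned work ...
  reattach within %sr, label %cont
cont:
  sync within %sr, label %exit   ; continuation immediately syncs
exit:
  ret void
}

declare token @llvm.syncregion.start()
```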
commit 86df0ba3770a03a8271a5bba7f1a3708b3f0d153 Merge: 3fbb3bcf4cb bf26b3fcaec Author: TB Schardl Date: Mon Jul 27 08:22:30 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 3fbb3bcf4cbbe96c286774917025664dd8e2de80 Merge: 7bb5864b2ad 52f969b0298 Author: TB Schardl Date: Thu Jul 23 08:57:48 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 7bb5864b2ad318eb969b7f8d78e6d5171a8b9cbc Merge: 9a2143e2643 717d8ad6cf4 Author: TB Schardl Date: Wed Jul 22 08:02:57 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 9a2143e26433557e7f1eac221099bd037e487e80 Merge: c9d4623ac37 c721349466d Author: TB Schardl Date: Tue Jul 21 08:42:46 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit c9d4623ac37b0ba06e727dc71df3ec559a267762 Author: TB Schardl Date: Mon Jul 20 15:56:29 2015 -0400 Don't perform PRE across a detach or reattach, as it requires splitting a critical edge. commit e1df337ce92636114885f3268afaa571e279bcb2 Author: TB Schardl Date: Mon Jul 20 15:55:53 2015 -0400 Detach-reattach pairs create unsplittable critical edges. Add some asserts to check that we don't try to split those edges. commit 48ec13d545fde4c80f86132b330dec9c672c29b3 Author: TB Schardl Date: Mon Jul 20 13:32:44 2015 -0400 Minor edit to instruction combining to avoid pessimization of moving code after a sync. commit 46d9cfe4c634c7229c16623ca17f0b27d3c7ad28 Merge: c99bacd4cec 96d9043a78b Author: TB Schardl Date: Mon Jul 20 10:53:45 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit c99bacd4cecc8f6a9b0f159d957c81ca90a53c06 Author: TB Schardl Date: Mon Jul 20 10:50:26 2015 -0400 Updating existing optimization passes to generate correct code around detach/reattach/sync instructions. Tested on fib and simple race example codes. Some passes, such as redundant instruction combining, are still pessimizations for these parallel codes. 
commit bf96714f54abff14ce58abec408cafb5367ab0fe Merge: c8594201bba 591adee23bf Author: TB Schardl Date: Fri Jul 17 09:14:14 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit c8594201bba51bac12bf581ec5f11aff5e767f9b Merge: c10991b43d5 72400f8d508 Author: TB Schardl Date: Thu Jul 16 08:59:20 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit c10991b43d5dedafd23d7579635da4e111fd598c Merge: 1d47de608d6 4aa2f4514cc Author: TB Schardl Date: Wed Jul 15 08:16:18 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 1d47de608d6e59908f715569137f5e2dac1f339a Merge: 3a70241cdea 815d6131a4d Author: TB Schardl Date: Tue Jul 14 09:19:14 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 3a70241cdea09232c8e26cfe42e56fac598ed8ba Author: TB Schardl Date: Sun Jul 12 08:24:32 2015 -0400 Updated PromoteMemoryToRegister to properly handle reattach, specifically, to avoid promoting alloca's if doing so would require a Phi node to inherit register state through a reattach. commit 51d54d96cc3cdaec661ea2268e8dd6294b22375a Author: TB Schardl Date: Sun Jul 12 08:23:11 2015 -0400 Adjusting reattach to look more like a branch. 
commit d39d1f75be719678706e403c64d1a53f9387ef98 Author: TB Schardl Date: Sun Jul 12 08:22:20 2015 -0400 Updated comments in IRBuilder commit 08f1f890d00a14f4ffccdf7da44b8c7b0e5daa12 Merge: 3fa3c489669 1e3fa768c01 Author: TB Schardl Date: Sat Jul 11 07:59:42 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 3fa3c489669220cef599f61adb52c0b3eba4bc0c Merge: 48100a712f7 e57b60a7f96 Author: TB Schardl Date: Fri Jul 10 08:48:24 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 48100a712f7dddd6578ec0f93fd55ae5ddc033ce Merge: 72a88786c60 86b4ed2fc40 Author: TB Schardl Date: Thu Jul 9 09:01:23 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 703f88a7461d9741c5d0203c02e702f48cda74e7 Merge: 5c355339f57 8e3d42ecb81 Author: Billy Moses Date: Wed Jul 8 21:51:37 2015 -0700 Merge branch 'tb-scratch' of github.com:taekwonbilly/Parallel-IR into billy-scratch commit 5c355339f57181fbf8ce8e665ce4a5e1b18a6a35 Author: Billy Moses Date: Wed Jul 8 21:50:21 2015 -0700 fix merge error commit 72a88786c604e0c99dace11e7ab02b9bea53c7c4 Merge: ab1078ca539 080d7a819f4 Author: TB Schardl Date: Wed Jul 8 07:54:34 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit ab1078ca5394d4a132b9bfef2b45fe9936355c62 Author: TB Schardl Date: Tue Jul 7 23:59:33 2015 -0400 Rework reattach to take a basic block as an argument. Reattach is therefore like a break, while not being a break. 
commit 189cbf6873ffb4880a10098341abdc18447d38d3 Merge: 8e3d42ecb81 7b7c81cd353 Author: TB Schardl Date: Tue Jul 7 13:45:38 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 8e3d42ecb81ed3d9b8a9bc45e17ba151aaba45fc Author: TB Schardl Date: Tue Jul 7 08:58:35 2015 -0400 Initial hack to disallow SimplifyCFG from removing sync instructions commit 738e14f4a4dcb70e7e9e6ff1e0727b05ce14c008 Author: TB Schardl Date: Tue Jul 7 08:57:47 2015 -0400 Fix comments on SyncInst commit bf1508cc4427479a10092210237db6678c1ef6d5 Merge: 19e947bd14f 2822246ecee Author: TB Schardl Date: Tue Jul 7 08:18:24 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 19e947bd14f9b9e718ab634481a0a0d96962b216 Merge: de195a8462b a25ee390b55 Author: TB Schardl Date: Mon Jul 6 08:24:00 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit bb180502346ef66f459596d49bc26c15dc822f88 Merge: c6662084d9e de195a8462b Author: Billy Moses Date: Sun Jul 5 21:36:33 2015 -0700 Merge branch 'tb-scratch' into billy-scratch Conflicts: include/llvm/Bitcode/LLVMBitCodes.h lib/AsmParser/LLParser.cpp lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h lib/IR/Instructions.cpp commit de195a8462b52201728b19904a7ff895e3c2b8a0 Author: TB Schardl Date: Thu Jul 2 15:04:46 2015 -0400 Temporary sync instruction, in order to develop dominance analysis for CFG's with parallel control dependencies. 
commit 738db4461c0b4305c31f9feab72003012c2dcea8 Merge: 02ff4acf5a2 e4e6f29c93d Author: TB Schardl Date: Thu Jul 2 08:52:48 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 02ff4acf5a260ed830edf5f0764c49f3ce5bdfda Merge: 999aed1e3d0 7e6843cbd68 Author: TB Schardl Date: Wed Jul 1 09:43:34 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 999aed1e3d0c0df3d4d3d8f5b4ebbe7181834cba Author: TB Schardl Date: Wed Jul 1 09:42:21 2015 -0400 Teaching SCCP about 'detach' and 'reattach', such that optimization passes can run on codes with these IR instructions. commit d2f3f1e9b8c80feb8621e3897998a24c68365bed Merge: ea299f63c15 37cb5f1c2db Author: TB Schardl Date: Wed Jul 1 08:33:51 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit ea299f63c158dd1b90dcef36729f361c69f54505 Author: TB Schardl Date: Tue Jun 30 08:27:46 2015 -0400 Renamed 'spawn' to 'detach' commit f88a6553ebb86f8d5304a7b8df238b2274d936cd Merge: 3b6df76c9a1 a5106ca54d0 Author: TB Schardl Date: Tue Jun 30 08:03:44 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit 3b6df76c9a154c3ae22fe89569dfdac23637d12b Merge: e62bd55cd9d e8f07a7eb39 Author: TB Schardl Date: Mon Jun 29 09:50:59 2015 -0400 Merge branch 'master' of http://llvm.org/git/llvm into pir commit e62bd55cd9d749090f9137363ae55ada11a2eb4e Merge: 4dc79856c77 43e99f618db Author: William Moses Date: Sun Jun 28 16:13:19 2015 -0700 Merge pull request #1 from taekwonbilly/billy-scratch Add reattach instruction commit c6662084d9ecac843815ba39681d6ad2b3c3faaf Author: Billy Moses Date: Sun Jun 28 16:09:53 2015 -0700 allow to compile commit 43e99f618db80683c40b98110a9320fb88f2b75f Author: Billy Moses Date: Sat Jun 27 13:20:11 2015 -0700 add token commit 1a4a51b9510224c583acc08555807713a26277e2 Author: Billy Moses Date: Thu Jun 25 22:35:46 2015 -0700 Reattach commit 5861430d7fe8c36d01e42d5a79765232d3733a55 Author: Billy Moses Date: Thu Jun 25 22:13:27 2015 -0700 
Reattach commit 4dc79856c77887cd506b15fee5793608071c7b0d Author: TB Schardl Date: Thu Jun 25 14:08:59 2015 -0400 Cleanup: remove unnecessary space commit 592fd5576cc26e3a0ba7efe4918b0c8f94c54b0f Merge: 08297c02e75 eebe475623c Author: TB Schardl Date: Thu Jun 25 13:49:16 2015 -0400 Merge remote-tracking branch 'origin/tb-scratch' into pir commit 08297c02e75ec7416751d443a99239d464c90061 Author: TB Schardl Date: Thu Jun 25 13:46:26 2015 -0400 Porting spawn instruction to current LLVM master. Added CreateSpawn to IRBuilder. commit eebe475623c877375a6718b362a76e2bd8843e11 Author: TB Schardl Date: Wed May 27 10:00:32 2015 -0400 cleaning up directory to support compilation on my system within a separate build directory commit 41059692e83eacd80f90f7df15510f97ae7c679d Author: Billy Moses Date: Tue May 26 18:27:48 2015 -0400 fix commit 21846df31a8b5b45b82781e8f8a6eb8c9c8dcb0f Author: Billy Moses Date: Tue May 26 17:55:07 2015 -0400 rm scruff commit d3d85e53fb33660f44a60f9e1c04c133596a7344 Author: Billy Moses Date: Tue May 26 13:13:00 2015 -0400 cleanup commit 8cc15c93dcee39782e92168f85e67fb7db46d069 Merge: 218888afe22 f3fb567248e Author: Billy Moses Date: Tue May 26 13:11:33 2015 -0400 Merge branch 'master' of github.com:taekwonbilly/Parallel-IR fix issue commit 218888afe22e6c297f19a5641809492429be18a7 Author: Billy Moses Date: Tue May 26 13:09:58 2015 -0400 fixed commit f3fb567248ece821dd2cd77008d1be0c385e78b0 Author: Billy Moses Date: Wed May 20 16:45:43 2015 -0400 ud commit 8721e720eeb689bf1e9f3f401a4aa851725cc126 Author: Billy Moses Date: Wed Apr 15 09:57:41 2015 -0400 reset commit b1dd73bcb3e3adc89c78acf620b81a7271f261b3 Author: Billy Moses Date: Mon Apr 13 10:13:29 2015 -0400 Last commit before change syntax commit 5cdcb6294493acf8bf10274c3a4a6f1f70c6de36 Author: Billy Moses Date: Mon Apr 6 12:05:40 2015 -0400 Updated llvm / added Future Type commit 2ce961b4e05eab9cb04b59e73ee1209b74e39524 Author: Billy Moses Date: Wed Apr 1 11:04:13 2015 -0400 update llvm commit 
5a8e342deb6ff3f9535890096b76731028740219 Author: Billy Moses Date: Tue Mar 17 22:55:24 2015 -0400 Added llvm src commit 140e15b2bddcc72a1a07b1dce8b84ae00f371e55 Author: Billy Moses Date: Tue Mar 17 22:11:09 2015 -0400 first commit Updated clang submodule Code cleanup to reduce diff against mainline LLVM. Fixes to address several failing LLVM regression tests. Based on the SyncElimination tests, specifically "for2.ll," it appears that SyncElimination removes sync instructions that are not safe to remove. One relevant test has been updated to note this problem and marked "XFAIL." [CodeGen] Reverting an earlier change to SelectionDagISel for Cilk codes. Previously, to fix an “rbp/rsp issue” with Cilk codes, SelectionDagISel was changed to set a flag in functions that expose “returns twice”, in order to make those functions appear to contain variable sized objects. Setting this flag causes LLVM regression test “CodeGen/X86/setjmp-spills” to fail. Setting related flags, such as “HasOpaqueSPAdjustment” through their existing public interface also causes the same regression test to fail. In addition, I don’t see any rbp/rsp issues with Cilk codes when SelectionDagISel does not set any such flag. For these reasons, I'm removing this previous change to SelectionDagISel. [PassManager] Reworking Tapir modification to PassManagerBuilder to ensure that Sanitizer instrumentation passes run only once. This change should also help improve Tapir's compatibility with LTO. [TSan] Reverting change to TSan instrumentation, which was causing a test to fail. We will need to introduce a new instrumentation pass specifically for CilkSan. Updated clang submodule [CilkSanitizer] Added custom instrumentation pass for CilkSan. [CilkSanitizer] Added instrumentation of memory intrinsics and atomics. Added simple optimization to elide instrumentation of non-captured pointers in serial functions. Updated clang submodule. [CilkSanitizer] Improved analysis for avoiding instrumentation. 
[CilkSanitizer] Allow CilkSanitizer to handle a larger variety of memory access sizes and to properly ignore memory accesses of illegal sizes. [AliasAnalysis] Fixed compiler warning. [CSI] Store multiple filenames when multiple files are compiled together. Updated clang submodule Remove .travis.yml --- llvm/.circleci/config.yml | 30 + llvm/.gitignore | 5 + llvm/898/sync_elimination_pfor_mb/main.c | 35 + llvm/898/sync_elimination_pfor_mb/makefile | 16 + llvm/LICENSE.TXT | 9 + llvm/README.md | 24 + llvm/include/llvm-c/Core.h | 24 +- llvm/include/llvm-c/Initialization.h | 1 + llvm/include/llvm-c/Transforms/Tapir.h | 50 + llvm/include/llvm/Analysis/AliasAnalysis.h | 21 + llvm/include/llvm/Analysis/DetachSSA.h | 827 ++++++ llvm/include/llvm/Bitcode/LLVMBitCodes.h | 3 + .../llvm/CodeGen/GlobalISel/IRTranslator.h | 6 + llvm/include/llvm/IR/BasicBlock.h | 1 + llvm/include/llvm/IR/DerivedTypes.h | 4 + llvm/include/llvm/IR/IRBuilder.h | 20 + llvm/include/llvm/IR/InstVisitor.h | 12 + llvm/include/llvm/IR/Instruction.def | 141 +- llvm/include/llvm/IR/Instruction.h | 1 + llvm/include/llvm/IR/Instructions.h | 249 ++ llvm/include/llvm/IR/Intrinsics.td | 7 + llvm/include/llvm/IR/Value.def | 4 + llvm/include/llvm/InitializePasses.h | 10 + llvm/include/llvm/LinkAllPasses.h | 8 + llvm/include/llvm/Transforms/CSI.h | 610 +++++ .../llvm/Transforms/IPO/PassManagerBuilder.h | 14 + .../include/llvm/Transforms/Instrumentation.h | 21 +- llvm/include/llvm/Transforms/Scalar.h | 6 + .../include/llvm/Transforms/Scalar/LoopFuse.h | 130 + llvm/include/llvm/Transforms/Scalar/SROA.h | 1 + llvm/include/llvm/Transforms/Tapir.h | 68 + llvm/include/llvm/Transforms/Tapir/CilkABI.h | 368 +++ .../llvm/Transforms/Tapir/LoopSpawning.h | 37 + llvm/include/llvm/Transforms/Tapir/Outline.h | 88 + .../llvm/Transforms/Utils/BasicBlockUtils.h | 6 + .../llvm/Transforms/Utils/ModuleUtils.h | 7 + .../llvm/Transforms/Utils/PromoteMemToReg.h | 1 + .../llvm/Transforms/Utils/SSAUpdater.h | 5 + 
.../llvm/Transforms/Utils/SSAUpdaterImpl.h | 79 +- .../llvm/Transforms/Utils/TapirUtils.h | 53 + llvm/lib/Analysis/AliasAnalysis.cpp | 123 +- llvm/lib/Analysis/Analysis.cpp | 2 + llvm/lib/Analysis/CMakeLists.txt | 1 + llvm/lib/Analysis/DetachSSA.cpp | 1082 ++++++++ llvm/lib/Analysis/MemorySSA.cpp | 2 +- llvm/lib/AsmParser/LLLexer.cpp | 3 + llvm/lib/AsmParser/LLParser.cpp | 86 + llvm/lib/AsmParser/LLParser.h | 3 + llvm/lib/AsmParser/LLToken.h | 5 + llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 53 + llvm/lib/Bitcode/Writer/BitcodeWriter.cpp | 25 + llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 56 + llvm/lib/CodeGen/MachineSSAUpdater.cpp | 16 + llvm/lib/CodeGen/MachineSink.cpp | 62 +- .../SelectionDAG/SelectionDAGBuilder.cpp | 64 + .../SelectionDAG/SelectionDAGBuilder.h | 6 + llvm/lib/CodeGen/TargetLoweringBase.cpp | 3 + llvm/lib/IR/AsmWriter.cpp | 23 + llvm/lib/IR/BasicBlock.cpp | 42 + llvm/lib/IR/Instruction.cpp | 5 + llvm/lib/IR/Instructions.cpp | 186 ++ llvm/lib/IR/Type.cpp | 7 + llvm/lib/IR/Verifier.cpp | 7 + llvm/lib/Passes/PassBuilder.cpp | 1 + llvm/lib/Passes/PassRegistry.def | 3 + llvm/lib/Transforms/CMakeLists.txt | 1 + llvm/lib/Transforms/IPO/LLVMBuild.txt | 2 +- .../lib/Transforms/IPO/PassManagerBuilder.cpp | 119 + .../InstCombine/InstCombineCalls.cpp | 9 + .../InstCombineLoadStoreAlloca.cpp | 1 + .../InstCombine/InstructionCombining.cpp | 11 +- .../Instrumentation/AddressSanitizer.cpp | 7 + .../Transforms/Instrumentation/CMakeLists.txt | 2 + .../Instrumentation/CilkSanitizer.cpp | 1164 ++++++++ .../ComprehensiveStaticInstrumentation.cpp | 982 +++++++ .../Instrumentation/Instrumentation.cpp | 2 + llvm/lib/Transforms/LLVMBuild.txt | 2 +- llvm/lib/Transforms/Scalar/CMakeLists.txt | 1 + llvm/lib/Transforms/Scalar/GVN.cpp | 61 +- llvm/lib/Transforms/Scalar/JumpThreading.cpp | 18 +- llvm/lib/Transforms/Scalar/LICM.cpp | 30 + llvm/lib/Transforms/Scalar/LoopFuse.cpp | 561 ++++ llvm/lib/Transforms/Scalar/LoopRotation.cpp | 597 ++++ 
llvm/lib/Transforms/Scalar/SCCP.cpp | 75 + llvm/lib/Transforms/Scalar/SROA.cpp | 40 +- llvm/lib/Transforms/Scalar/Scalar.cpp | 1 + .../lib/Transforms/Scalar/SimplifyCFGPass.cpp | 69 + .../Scalar/TailRecursionElimination.cpp | 98 + llvm/lib/Transforms/Tapir/CMakeLists.txt | 18 + llvm/lib/Transforms/Tapir/CilkABI.cpp | 1344 +++++++++ llvm/lib/Transforms/Tapir/LLVMBuild.txt | 22 + llvm/lib/Transforms/Tapir/LoopSpawning.cpp | 2413 +++++++++++++++++ llvm/lib/Transforms/Tapir/LowerToCilk.cpp | 219 ++ llvm/lib/Transforms/Tapir/Outline.cpp | 379 +++ llvm/lib/Transforms/Tapir/RedundantSpawn.cpp | 87 + llvm/lib/Transforms/Tapir/SmallBlock.cpp | 68 + .../lib/Transforms/Tapir/SpawnRestructure.cpp | 48 + llvm/lib/Transforms/Tapir/SpawnUnswitch.cpp | 96 + llvm/lib/Transforms/Tapir/SyncElimination.cpp | 273 ++ llvm/lib/Transforms/Tapir/Tapir.cpp | 43 + llvm/lib/Transforms/Utils/BasicBlockUtils.cpp | 25 +- .../Transforms/Utils/BreakCriticalEdges.cpp | 45 + llvm/lib/Transforms/Utils/CMakeLists.txt | 1 + llvm/lib/Transforms/Utils/InlineFunction.cpp | 39 +- llvm/lib/Transforms/Utils/LoopSimplify.cpp | 6 + llvm/lib/Transforms/Utils/LoopUnroll.cpp | 10 + llvm/lib/Transforms/Utils/LoopUtils.cpp | 61 + llvm/lib/Transforms/Utils/Mem2Reg.cpp | 25 +- llvm/lib/Transforms/Utils/ModuleUtils.cpp | 18 + .../Utils/PromoteMemoryToRegister.cpp | 123 +- llvm/lib/Transforms/Utils/SSAUpdater.cpp | 66 +- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 152 ++ llvm/lib/Transforms/Utils/TapirUtils.cpp | 318 +++ .../Transforms/Vectorize/LoopVectorize.cpp | 10 + llvm/microbenchmarks/everything/everything.c | 32 + llvm/microbenchmarks/everything/everything.ll | 118 + llvm/microbenchmarks/everything/simple.c | 15 + llvm/microbenchmarks/everything/simple.ll | 53 + llvm/microbenchmarks/everything/temp.ll | 24 + llvm/microbenchmarks/redundantspawn/complex.c | 32 + .../redundantspawn/multiple_nested.c | 21 + .../redundantspawn/multiple_redundant.c | 20 + llvm/microbenchmarks/redundantspawn/serial.c | 15 + 
.../redundantspawn/simple_spawn.c | 15 + .../redundantspawn/single_redundant.c | 16 + llvm/microbenchmarks/smallblock/conditional.c | 27 + .../microbenchmarks/smallblock/conditional.ll | 66 + .../smallblock/conditional_opt.ll | 89 + .../smallblock/multiple_nested.c | 21 + .../smallblock/multiple_spawn.c | 19 + llvm/microbenchmarks/smallblock/serial.c | 15 + .../microbenchmarks/smallblock/simple_spawn.c | 15 + .../spawnrestructure/base_negative.c | 20 + .../spawnrestructure/base_negative.ll | 46 + .../spawnrestructure/base_positive.c | 19 + .../spawnrestructure/base_positive.ll | 46 + .../spawnrestructure/complex.c | 32 + .../spawnrestructure/multiple_nested.c | 21 + .../microbenchmarks/spawnrestructure/serial.c | 15 + .../spawnrestructure/simple_spawn.c | 15 + llvm/microbenchmarks/spawnunswitch/simple.c | 16 + llvm/microbenchmarks/spawnunswitch/simple.ll | 41 + llvm/microbenchmarks/spawnunswitch/simple2.c | 14 + llvm/microbenchmarks/spawnunswitch/simple2.ll | 37 + llvm/microbenchmarks/spawnunswitch/temp.ll | 38 + llvm/microbenchmarks/spawnunswitch/test.c | 12 + llvm/microbenchmarks/spawnunswitch/test2.c | 12 + llvm/microbenchmarks/timing/average.py | 10 + llvm/microbenchmarks/timing/ratio.sh | 1 + llvm/microbenchmarks/timing/simple | Bin 0 -> 8480 bytes llvm/microbenchmarks/timing/simple.c | 16 + llvm/microbenchmarks/timing/spawn | Bin 0 -> 8480 bytes llvm/microbenchmarks/timing/spawn.c | 19 + llvm/test/Transforms/LoopFuse/fuse.ll | 87 + llvm/test/Transforms/LoopFuse/no-fuse.ll | 78 + .../Tapir/SyncElimination/basic1.cpp | 6 + .../Tapir/SyncElimination/basic1.ll | 29 + .../Tapir/SyncElimination/basic2.cpp | 8 + .../Tapir/SyncElimination/basic2.ll | 34 + .../Tapir/SyncElimination/fail1.cpp | 9 + .../Transforms/Tapir/SyncElimination/fail1.ll | 37 + .../Tapir/SyncElimination/fail2.cpp | 10 + .../Transforms/Tapir/SyncElimination/fail2.ll | 42 + .../Transforms/Tapir/SyncElimination/for1.cpp | 8 + .../Transforms/Tapir/SyncElimination/for1.ll | 112 + 
.../Transforms/Tapir/SyncElimination/for2.cpp | 8 + .../Transforms/Tapir/SyncElimination/for2.ll | 78 + .../Tapir/dac-loopspawning-simple.ll | 98 + llvm/test/Transforms/Tapir/looplimit.ll | 96 + llvm/test/Transforms/Tapir/tapir-licm.ll | 60 + llvm/tools/bugpoint/CMakeLists.txt | 1 + llvm/tools/bugpoint/LLVMBuild.txt | 1 + llvm/tools/bugpoint/bugpoint.cpp | 1 + llvm/tools/clang | 1 + llvm/tools/opt/CMakeLists.txt | 1 + llvm/tools/opt/LLVMBuild.txt | 1 + llvm/tools/opt/opt.cpp | 1 + llvm/utils/emacs/llvm-mode.el | 6 +- 178 files changed, 16355 insertions(+), 131 deletions(-) create mode 100644 llvm/.circleci/config.yml create mode 100644 llvm/898/sync_elimination_pfor_mb/main.c create mode 100644 llvm/898/sync_elimination_pfor_mb/makefile create mode 100644 llvm/README.md create mode 100644 llvm/include/llvm-c/Transforms/Tapir.h create mode 100644 llvm/include/llvm/Analysis/DetachSSA.h create mode 100644 llvm/include/llvm/Transforms/CSI.h create mode 100644 llvm/include/llvm/Transforms/Scalar/LoopFuse.h create mode 100644 llvm/include/llvm/Transforms/Tapir.h create mode 100644 llvm/include/llvm/Transforms/Tapir/CilkABI.h create mode 100644 llvm/include/llvm/Transforms/Tapir/LoopSpawning.h create mode 100644 llvm/include/llvm/Transforms/Tapir/Outline.h create mode 100644 llvm/include/llvm/Transforms/Utils/TapirUtils.h create mode 100644 llvm/lib/Analysis/DetachSSA.cpp create mode 100644 llvm/lib/Transforms/Instrumentation/CilkSanitizer.cpp create mode 100644 llvm/lib/Transforms/Instrumentation/ComprehensiveStaticInstrumentation.cpp create mode 100644 llvm/lib/Transforms/Scalar/LoopFuse.cpp create mode 100644 llvm/lib/Transforms/Tapir/CMakeLists.txt create mode 100644 llvm/lib/Transforms/Tapir/CilkABI.cpp create mode 100644 llvm/lib/Transforms/Tapir/LLVMBuild.txt create mode 100644 llvm/lib/Transforms/Tapir/LoopSpawning.cpp create mode 100644 llvm/lib/Transforms/Tapir/LowerToCilk.cpp create mode 100644 llvm/lib/Transforms/Tapir/Outline.cpp create mode 100644 
llvm/lib/Transforms/Tapir/RedundantSpawn.cpp create mode 100644 llvm/lib/Transforms/Tapir/SmallBlock.cpp create mode 100644 llvm/lib/Transforms/Tapir/SpawnRestructure.cpp create mode 100644 llvm/lib/Transforms/Tapir/SpawnUnswitch.cpp create mode 100644 llvm/lib/Transforms/Tapir/SyncElimination.cpp create mode 100644 llvm/lib/Transforms/Tapir/Tapir.cpp create mode 100644 llvm/lib/Transforms/Utils/TapirUtils.cpp create mode 100644 llvm/microbenchmarks/everything/everything.c create mode 100644 llvm/microbenchmarks/everything/everything.ll create mode 100644 llvm/microbenchmarks/everything/simple.c create mode 100644 llvm/microbenchmarks/everything/simple.ll create mode 100644 llvm/microbenchmarks/everything/temp.ll create mode 100644 llvm/microbenchmarks/redundantspawn/complex.c create mode 100644 llvm/microbenchmarks/redundantspawn/multiple_nested.c create mode 100644 llvm/microbenchmarks/redundantspawn/multiple_redundant.c create mode 100644 llvm/microbenchmarks/redundantspawn/serial.c create mode 100644 llvm/microbenchmarks/redundantspawn/simple_spawn.c create mode 100644 llvm/microbenchmarks/redundantspawn/single_redundant.c create mode 100644 llvm/microbenchmarks/smallblock/conditional.c create mode 100644 llvm/microbenchmarks/smallblock/conditional.ll create mode 100644 llvm/microbenchmarks/smallblock/conditional_opt.ll create mode 100644 llvm/microbenchmarks/smallblock/multiple_nested.c create mode 100644 llvm/microbenchmarks/smallblock/multiple_spawn.c create mode 100644 llvm/microbenchmarks/smallblock/serial.c create mode 100644 llvm/microbenchmarks/smallblock/simple_spawn.c create mode 100644 llvm/microbenchmarks/spawnrestructure/base_negative.c create mode 100644 llvm/microbenchmarks/spawnrestructure/base_negative.ll create mode 100644 llvm/microbenchmarks/spawnrestructure/base_positive.c create mode 100644 llvm/microbenchmarks/spawnrestructure/base_positive.ll create mode 100644 llvm/microbenchmarks/spawnrestructure/complex.c create mode 100644 
llvm/microbenchmarks/spawnrestructure/multiple_nested.c create mode 100644 llvm/microbenchmarks/spawnrestructure/serial.c create mode 100644 llvm/microbenchmarks/spawnrestructure/simple_spawn.c create mode 100644 llvm/microbenchmarks/spawnunswitch/simple.c create mode 100644 llvm/microbenchmarks/spawnunswitch/simple.ll create mode 100644 llvm/microbenchmarks/spawnunswitch/simple2.c create mode 100644 llvm/microbenchmarks/spawnunswitch/simple2.ll create mode 100644 llvm/microbenchmarks/spawnunswitch/temp.ll create mode 100644 llvm/microbenchmarks/spawnunswitch/test.c create mode 100644 llvm/microbenchmarks/spawnunswitch/test2.c create mode 100644 llvm/microbenchmarks/timing/average.py create mode 100644 llvm/microbenchmarks/timing/ratio.sh create mode 100755 llvm/microbenchmarks/timing/simple create mode 100644 llvm/microbenchmarks/timing/simple.c create mode 100755 llvm/microbenchmarks/timing/spawn create mode 100644 llvm/microbenchmarks/timing/spawn.c create mode 100644 llvm/test/Transforms/LoopFuse/fuse.ll create mode 100644 llvm/test/Transforms/LoopFuse/no-fuse.ll create mode 100644 llvm/test/Transforms/Tapir/SyncElimination/basic1.cpp create mode 100644 llvm/test/Transforms/Tapir/SyncElimination/basic1.ll create mode 100644 llvm/test/Transforms/Tapir/SyncElimination/basic2.cpp create mode 100644 llvm/test/Transforms/Tapir/SyncElimination/basic2.ll create mode 100644 llvm/test/Transforms/Tapir/SyncElimination/fail1.cpp create mode 100644 llvm/test/Transforms/Tapir/SyncElimination/fail1.ll create mode 100644 llvm/test/Transforms/Tapir/SyncElimination/fail2.cpp create mode 100644 llvm/test/Transforms/Tapir/SyncElimination/fail2.ll create mode 100644 llvm/test/Transforms/Tapir/SyncElimination/for1.cpp create mode 100644 llvm/test/Transforms/Tapir/SyncElimination/for1.ll create mode 100644 llvm/test/Transforms/Tapir/SyncElimination/for2.cpp create mode 100644 llvm/test/Transforms/Tapir/SyncElimination/for2.ll create mode 100644 
llvm/test/Transforms/Tapir/dac-loopspawning-simple.ll
 create mode 100644 llvm/test/Transforms/Tapir/looplimit.ll
 create mode 100644 llvm/test/Transforms/Tapir/tapir-licm.ll
 create mode 160000 llvm/tools/clang

diff --git a/llvm/.circleci/config.yml b/llvm/.circleci/config.yml
new file mode 100644
index 00000000000000..ad6dfc714d34e1
--- /dev/null
+++ b/llvm/.circleci/config.yml
@@ -0,0 +1,30 @@
+version: 2
+jobs:
+  build:
+    resource_class: xlarge
+    docker:
+      - image: wsmoses/tapir:latest
+
+    steps:
+      - checkout
+      - run:
+          name: submodules
+          command: |
+            git submodule sync
+            git submodule update --init --recursive
+      - run:
+          name: cmake
+          command: |
+            mkdir build
+            cd build
+            cmake .. -DCMAKE_BUILD_TYPE=Release -DLLVM_TARGETS_TO_BUILD=host -DLLVM_BUILD_TESTS=ON -DLLVM_ENABLE_ASSERTIONS=ON -DLLVM_ENABLE_LTO=OFF -DLLVM_PARALLEL_COMPILE_JOBS=2 -DLLVM_PARALLEL_LINK_JOBS=1
+      - run:
+          name: make
+          command: |
+            cd build
+            make -j2
+      - run:
+          name: test
+          command: |
+            cd build
+            make check-all
diff --git a/llvm/.gitignore b/llvm/.gitignore
index be58944c9b3a1a..ba3de277d5e2f7 100644
--- a/llvm/.gitignore
+++ b/llvm/.gitignore
@@ -80,3 +80,8 @@ docs/_build
 #==============================================================================#
 bindings/go/llvm/llvm_config.go
 bindings/go/llvm/workdir
+
+build/*
+build
+build-debug/*
+build-debug
diff --git a/llvm/898/sync_elimination_pfor_mb/main.c b/llvm/898/sync_elimination_pfor_mb/main.c
new file mode 100644
index 00000000000000..b5e0ebc4281ad5
--- /dev/null
+++ b/llvm/898/sync_elimination_pfor_mb/main.c
@@ -0,0 +1,35 @@
+#include
+
+#include
+
+#define N 100000000
+
+__attribute__((always_inline))
+int f(int x) {
+    return x * x;
+}
+
+__attribute__((always_inline))
+int g(int x) {
+    return x + 3;
+}
+
+int r1[N];
+int r2[N];
+
+int main(void)
+{
+    int sum = 0;
+
+    cilk_for (int i=0; i
+template <class T> class detachaccess_def_iterator_base;
+using detachaccess_def_iterator = detachaccess_def_iterator_base<DetachAccess>;
+using const_detachaccess_def_iterator =
detachaccess_def_iterator_base; + +// \brief The base for all detach accesses, i.e., detaches (defs) and syncs +// (uses). +class DetachAccess + : public DerivedUser, + public ilist_node>, + public ilist_node> { +public: + using AllAccessType = + ilist_node>; + using DefsOnlyType = + ilist_node>; + + // Methods for support type inquiry through isa, cast, and + // dyn_cast + static inline bool classof(const Value *V) { + unsigned ID = V->getValueID(); + return ID == DetachUseVal || ID == DetachPhiVal || ID == DetachDefVal; + } + + DetachAccess(const DetachAccess &) = delete; + DetachAccess &operator=(const DetachAccess &) = delete; + + void *operator new(size_t, unsigned) = delete; + void *operator new(size_t) = delete; + + BasicBlock *getBlock() const { return Block; } + + void print(raw_ostream &OS) const; + void dump() const; + + /// \brief The user iterators for a detach access + typedef user_iterator iterator; + typedef const_user_iterator const_iterator; + + /// \brief This iterator walks over all of the defs in a given + /// DetachAccess. For DetachPhi nodes, this walks arguments. For + /// DetachUse/DetachDef, this walks the defining access. + detachaccess_def_iterator defs_begin(); + const_detachaccess_def_iterator defs_begin() const; + detachaccess_def_iterator defs_end(); + const_detachaccess_def_iterator defs_end() const; + + /// \brief Get the iterators for the all access list and the defs only list + /// We default to the all access list. 
+ AllAccessType::self_iterator getIterator() { + return this->AllAccessType::getIterator(); + } + AllAccessType::const_self_iterator getIterator() const { + return this->AllAccessType::getIterator(); + } + AllAccessType::reverse_self_iterator getReverseIterator() { + return this->AllAccessType::getReverseIterator(); + } + AllAccessType::const_reverse_self_iterator getReverseIterator() const { + return this->AllAccessType::getReverseIterator(); + } + DefsOnlyType::self_iterator getDefsIterator() { + return this->DefsOnlyType::getIterator(); + } + DefsOnlyType::const_self_iterator getDefsIterator() const { + return this->DefsOnlyType::getIterator(); + } + DefsOnlyType::reverse_self_iterator getReverseDefsIterator() { + return this->DefsOnlyType::getReverseIterator(); + } + DefsOnlyType::const_reverse_self_iterator getReverseDefsIterator() const { + return this->DefsOnlyType::getReverseIterator(); + } + +protected: + friend class DetachSSA; + friend class DetachUseOrDef; + friend class DetachUse; + friend class DetachDef; + friend class DetachPhi; + + /// \brief Used by DetachSSA to change the block of a DetachAccess when it is + /// moved. + void setBlock(BasicBlock *BB) { Block = BB; } + + /// \brief Used for debugging and tracking things about DetachAccesses. + /// Guaranteed unique among DetachAccesses, no guarantees otherwise. + inline unsigned getID() const; + + DetachAccess(LLVMContext &C, unsigned Vty, DeleteValueTy DeleteValue, + BasicBlock *BB, unsigned NumOperands) + : DerivedUser(Type::getVoidTy(C), Vty, nullptr, NumOperands, DeleteValue), + Block(BB) {} + +private: + BasicBlock *Block; +}; + +inline raw_ostream &operator<<(raw_ostream &OS, const DetachAccess &DA) { + DA.print(OS); + return OS; +} + +/// \brief Class that has the common methods + fields of detach uses/defs. 
It's +/// a little awkward to have, but there are many cases where we want either a +/// use or def, and there are many cases where uses are needed (defs aren't +/// acceptable), and vice-versa. +/// +/// This class should never be instantiated directly; make a DetachUse or +/// DetachDef instead. +class DetachUseOrDef : public DetachAccess { +public: + void *operator new(size_t, unsigned) = delete; + void *operator new(size_t) = delete; + + DECLARE_TRANSPARENT_OPERAND_ACCESSORS(DetachAccess); + + /// \brief Get the instruction that this DetachAccess represents. + Instruction *getDAInst() const { return DAInst; } + + /// \brief Get the access that produces the detach state used by this Use. + DetachAccess *getDefiningAccess() const { return getOperand(0); } + + static inline bool classof(const Value *DA) { + return DA->getValueID() == DetachUseVal || DA->getValueID() == DetachDefVal; + } + + // Sadly, these have to be public because they are needed in some of the + // iterators. + inline bool isOptimized() const; + inline DetachAccess *getOptimized() const; + inline void setOptimized(DetachAccess *); + + /// \brief Reset the ID of what this DetachUse was optimized to, causing it to + /// be rewalked by the walker if necessary. + /// This really should only be called by tests. + inline void resetOptimized(); + +protected: + friend class DetachSSA; + DetachUseOrDef(LLVMContext &C, DetachAccess *DDA, unsigned Vty, + DeleteValueTy DeleteValue, Instruction *TI, BasicBlock *BB) + : DetachAccess(C, Vty, DeleteValue, BB, 1), DAInst(TI) { + setDefiningAccess(DDA); + } + void setDefiningAccess(DetachAccess *DDA, bool Optimized = false) { + if (!Optimized) { + setOperand(0, DDA); + return; + } + setOptimized(DDA); + } + +private: + Instruction *DAInst; +}; + +template <> +struct OperandTraits + : public FixedNumOperandTraits {}; +DEFINE_TRANSPARENT_OPERAND_ACCESSORS(DetachUseOrDef, DetachAccess) + +/// \brief Represents a detach use, i.e., a sync instruction. 
+class DetachUse final : public DetachUseOrDef { +public: + DECLARE_TRANSPARENT_OPERAND_ACCESSORS(DetachAccess); + + DetachUse(LLVMContext &C, DetachAccess *DDA, Instruction *SI, BasicBlock *BB) + : DetachUseOrDef(C, DDA, DetachUseVal, deleteMe, SI, BB), + OptimizedID(0) {} + + // allocate space for exactly one operand + void *operator new(size_t s) { return User::operator new(s, 1); } + void *operator new(size_t, unsigned) = delete; + + static inline bool classof(const Value *DA) { + return DA->getValueID() == DetachUseVal; + } + + void print(raw_ostream &OS) const; + + void setOptimized(DetachAccess *DDA) { + OptimizedID = DDA->getID(); + setOperand(0, DDA); + } + + bool isOptimized() const { + return getDefiningAccess() && OptimizedID == getDefiningAccess()->getID(); + } + + DetachAccess *getOptimized() const { + return getDefiningAccess(); + } + void resetOptimized() { + OptimizedID = INVALID_DETACHACCESS_ID; + } + +protected: + friend class DetachSSA; + +private: + static void deleteMe(DerivedUser *Self); + + unsigned int OptimizedID; +}; + +template <> +struct OperandTraits : public FixedNumOperandTraits {}; +DEFINE_TRANSPARENT_OPERAND_ACCESSORS(DetachUse, DetachAccess) + +/// \brief Represents a detach definition, i.e., a detach. 
+class DetachDef final : public DetachUseOrDef { +public: + DECLARE_TRANSPARENT_OPERAND_ACCESSORS(DetachAccess); + + DetachDef(LLVMContext &C, DetachAccess *DDA, Instruction *DI, BasicBlock *BB, + unsigned Ver) + : DetachUseOrDef(C, DDA, DetachDefVal, deleteMe, DI, BB), + ID(Ver), Optimized(nullptr), OptimizedID(INVALID_DETACHACCESS_ID) {} + + // allocate space for exactly one operand + void *operator new(size_t s) { return User::operator new(s, 1); } + void *operator new(size_t, unsigned) = delete; + + static inline bool classof(const Value *DA) { + return DA->getValueID() == DetachDefVal; + } + + void setOptimized(DetachAccess *DA) { + Optimized = DA; + OptimizedID = getDefiningAccess()->getID(); + } + DetachAccess *getOptimized() const { return Optimized; } + bool isOptimized() const { + return getOptimized() && getDefiningAccess() && + OptimizedID == getDefiningAccess()->getID(); + } + void resetOptimized() { + OptimizedID = INVALID_DETACHACCESS_ID; + } + + void print(raw_ostream &OS) const; + + friend class DetachSSA; + + unsigned getID() const { return ID; } + +private: + static void deleteMe(DerivedUser *Self); + + const unsigned ID; + DetachAccess *Optimized; + unsigned int OptimizedID; +}; + +template <> +struct OperandTraits : public FixedNumOperandTraits {}; +DEFINE_TRANSPARENT_OPERAND_ACCESSORS(DetachDef, DetachAccess) + +/// \brief Represents phi nodes for detach accesses. +/// +/// These have the same semantics as regular phi nodes, with the exception that +/// only one phi will ever exist in a given basic block. +/// Guaranteeing one phi per block means guaranteeing there is only ever one +/// valid reaching DetachDef/DetachPHI along each path to the phi node. +/// This is ensured by not allowing disambiguation of the RHS of a DetachDef or +/// a DetachPhi's operands. 
+class DetachPhi final : public DetachAccess { + // allocate space for exactly zero operands + void *operator new(size_t s) { return User::operator new(s); } + +public: + /// Provide fast operand accessors + DECLARE_TRANSPARENT_OPERAND_ACCESSORS(DetachAccess); + + DetachPhi(LLVMContext &C, BasicBlock *BB, unsigned Ver, unsigned NumPreds = 0) + : DetachAccess(C, DetachPhiVal, deleteMe, BB, 0), ID(Ver), + ReservedSpace(NumPreds) { + allocHungoffUses(ReservedSpace); + } + + void *operator new(size_t, unsigned) = delete; + + // Block iterator interface. This provides access to the list of incoming + // basic blocks, which parallels the list of incoming values. + typedef BasicBlock **block_iterator; + typedef BasicBlock *const *const_block_iterator; + + block_iterator block_begin() { + auto *Ref = reinterpret_cast(op_begin() + ReservedSpace); + return reinterpret_cast(Ref + 1); + } + + const_block_iterator block_begin() const { + const auto *Ref = + reinterpret_cast(op_begin() + ReservedSpace); + return reinterpret_cast(Ref + 1); + } + + block_iterator block_end() { return block_begin() + getNumOperands(); } + + const_block_iterator block_end() const { + return block_begin() + getNumOperands(); + } + + iterator_range blocks() { + return make_range(block_begin(), block_end()); + } + + iterator_range blocks() const { + return make_range(block_begin(), block_end()); + } + + op_range incoming_values() { return operands(); } + + const_op_range incoming_values() const { return operands(); } + + /// \brief Return the number of incoming edges + unsigned getNumIncomingValues() const { return getNumOperands(); } + + /// \brief Return incoming value number x + DetachAccess *getIncomingValue(unsigned I) const { return getOperand(I); } + void setIncomingValue(unsigned I, DetachAccess *V) { + assert(V && "PHI node got a null value!"); + setOperand(I, V); + } + static unsigned getOperandNumForIncomingValue(unsigned I) { return I; } + static unsigned 
getIncomingValueNumForOperand(unsigned I) { return I; } + + /// \brief Return incoming basic block number @p i. + BasicBlock *getIncomingBlock(unsigned I) const { return block_begin()[I]; } + + /// \brief Return incoming basic block corresponding + /// to an operand of the PHI. + BasicBlock *getIncomingBlock(const Use &U) const { + assert(this == U.getUser() && "Iterator doesn't point to PHI's Uses?"); + return getIncomingBlock(unsigned(&U - op_begin())); + } + + /// \brief Return incoming basic block corresponding + /// to value use iterator. + BasicBlock *getIncomingBlock(DetachAccess::const_user_iterator I) const { + return getIncomingBlock(I.getUse()); + } + + void setIncomingBlock(unsigned I, BasicBlock *BB) { + assert(BB && "PHI node got a null basic block!"); + block_begin()[I] = BB; + } + + /// \brief Add an incoming value to the end of the PHI list + void addIncoming(DetachAccess *V, BasicBlock *BB) { + if (getNumOperands() == ReservedSpace) + growOperands(); // Get more space! + // Initialize some new operands. + setNumHungOffUseOperands(getNumOperands() + 1); + setIncomingValue(getNumOperands() - 1, V); + setIncomingBlock(getNumOperands() - 1, BB); + } + + /// \brief Return the first index of the specified basic + /// block in the value list for this PHI. Returns -1 if no instance. 
+ int getBasicBlockIndex(const BasicBlock *BB) const { + for (unsigned I = 0, E = getNumOperands(); I != E; ++I) + if (block_begin()[I] == BB) + return I; + return -1; + } + + Value *getIncomingValueForBlock(const BasicBlock *BB) const { + int Idx = getBasicBlockIndex(BB); + assert(Idx >= 0 && "Invalid basic block argument!"); + return getIncomingValue(Idx); + } + + static inline bool classof(const Value *V) { + return V->getValueID() == DetachPhiVal; + } + + void print(raw_ostream &OS) const; + + unsigned getID() const { return ID; } + +protected: + friend class DetachSSA; + + /// \brief this is more complicated than the generic + /// User::allocHungoffUses, because we have to allocate Uses for the incoming + /// values and pointers to the incoming blocks, all in one allocation. + void allocHungoffUses(unsigned N) { + User::allocHungoffUses(N, /* IsPhi */ true); + } + +private: + // For debugging only + const unsigned ID; + unsigned ReservedSpace; + + /// \brief This grows the operand list in response to a push_back style of + /// operation. This grows the number of ops by 1.5 times. + void growOperands() { + unsigned E = getNumOperands(); + // 2 op PHI nodes are VERY common, so reserve at least enough for that. 
+    ReservedSpace = std::max(E + E / 2, 2u);
+    growHungoffUses(ReservedSpace, /* IsPhi */ true);
+  }
+
+  static void deleteMe(DerivedUser *Self);
+};
+
+inline unsigned DetachAccess::getID() const {
+  assert((isa<DetachDef>(this) || isa<DetachPhi>(this)) &&
+         "only detach defs and phis have ids");
+  if (const auto *DD = dyn_cast<DetachDef>(this))
+    return DD->getID();
+  return cast<DetachPhi>(this)->getID();
+}
+
+inline bool DetachUseOrDef::isOptimized() const {
+  if (const auto *DD = dyn_cast<DetachDef>(this))
+    return DD->isOptimized();
+  return cast<DetachUse>(this)->isOptimized();
+}
+
+inline DetachAccess *DetachUseOrDef::getOptimized() const {
+  if (const auto *DD = dyn_cast<DetachDef>(this))
+    return DD->getOptimized();
+  return cast<DetachUse>(this)->getOptimized();
+}
+
+inline void DetachUseOrDef::setOptimized(DetachAccess *DA) {
+  if (auto *DD = dyn_cast<DetachDef>(this))
+    DD->setOptimized(DA);
+  else
+    cast<DetachUse>(this)->setOptimized(DA);
+}
+
+inline void DetachUseOrDef::resetOptimized() {
+  if (auto *DD = dyn_cast<DetachDef>(this))
+    DD->resetOptimized();
+  else
+    cast<DetachUse>(this)->resetOptimized();
+}
+
+
+template <> struct OperandTraits<DetachPhi> : public HungoffOperandTraits<2> {};
+DEFINE_TRANSPARENT_OPERAND_ACCESSORS(DetachPhi, DetachAccess)
+
+
+/// \brief Encapsulates DetachSSA, including all data associated with detach
+/// accesses.
+class DetachSSA {
+public:
+  DetachSSA(Function &, DominatorTree *);
+  ~DetachSSA();
+
+  /// \brief Given a detach Mod/Ref'ing instruction, get the DetachSSA
+  /// access associated with it. If passed a basic block gets the detach phi
+  /// node that exists for that block, if there is one. Otherwise, this will get
+  /// a DetachUseOrDef.
+ DetachUseOrDef *getDetachAccess(const Instruction *) const; + DetachPhi *getDetachAccess(const BasicBlock *BB) const; + + void dump() const; + void print(raw_ostream &) const; + + /// \brief Return true if \p MA represents the live on entry value + inline bool isLiveOnEntryDef(const DetachAccess *DA) const { + return DA == LiveOnEntryDef.get(); + } + + inline DetachAccess *getLiveOnEntryDef() const { + return LiveOnEntryDef.get(); + } + + // Sadly, iplists, by default, owns and deletes pointers added to the + // list. It's not currently possible to have two iplists for the same type, + // where one owns the pointers, and one does not. This is because the traits + // are per-type, not per-tag. If this ever changes, we should make the + // DefList an iplist. + using AccessList = iplist>; + using DefsList = + simple_ilist>; + + /// \brief Return the list of MemoryAccess's for a given basic block. + /// + /// This list is not modifiable by the user. + const AccessList *getBlockAccesses(const BasicBlock *BB) const { + return getWritableBlockAccesses(BB); + } + + /// \brief Return the list of MemoryDef's and MemoryPhi's for a given basic + /// block. + /// + /// This list is not modifiable by the user. + const DefsList *getBlockDefs(const BasicBlock *BB) const { + return getWritableBlockDefs(BB); + } + + /// \brief Given two detach accesses in the same basic block, determine + /// whether DetachAccess \p A dominates DetachAccess \p B. + bool locallyDominates(const DetachAccess *A, const DetachAccess *B) const; + + /// \brief Given two detach accesses in potentially different blocks, + /// determine whether DetachAccess \p A dominates DetachAccess \p B. + bool dominates(const DetachAccess *A, const DetachAccess *B) const; + + /// \brief Given a DetachAccess and a Use, determine whether DetachAccess \p A + /// dominates Use \p B. 
+ bool dominates(const DetachAccess *A, const Use &B) const; + + /// \brief Verify that DetachSSA is self consistent (IE definitions dominate + /// all uses, uses appear in the right places). This is used by unit tests. + void verifyDetachSSA() const; + + /// Used in various insertion functions to specify whether we are talking + /// about the beginning or end of a block. + enum InsertionPlace { Beginning, End }; + +protected: + // Used by Detach SSA annotater, dumpers, and wrapper pass + friend class DetachSSAAnnotatedWriter; + friend class DetachSSAPrinterLegacyPass; + + void verifyDefUses(Function &F) const; + void verifyDomination(Function &F) const; + void verifyOrdering(Function &F) const; + + AccessList *getWritableBlockAccesses(const BasicBlock *BB) const { + auto It = PerBlockAccesses.find(BB); + return It == PerBlockAccesses.end() ? nullptr : It->second.get(); + } + + DefsList *getWritableBlockDefs(const BasicBlock *BB) const { + auto It = PerBlockDefs.find(BB); + return It == PerBlockDefs.end() ? nullptr : It->second.get(); + } + + void moveTo(DetachUseOrDef *What, BasicBlock *BB, AccessList::iterator Where); + void moveTo(DetachUseOrDef *What, BasicBlock *BB, InsertionPlace Point); + // Rename the dominator tree branch rooted at BB. 
+ void renamePass(BasicBlock *BB, DetachAccess *IncomingVal, + SmallPtrSetImpl &Visited) { + renamePass(DT->getNode(BB), IncomingVal, Visited, true, true); + } + void removeFromLookups(DetachAccess *); + void removeFromLists(DetachAccess *, bool ShouldDelete = true); + void insertIntoListsForBlock(DetachAccess *, const BasicBlock *, + InsertionPlace); + void insertIntoListsBefore(DetachAccess *, const BasicBlock *, + AccessList::iterator); + // DetachUseOrDef *createDefinedAccess(Instruction *, DetachAccess *); + +private: + // class CachingWalker; + + // CachingWalker *getWalkerImpl(); + void buildDetachSSA(); + + void verifyUseInDefs(DetachAccess *, DetachAccess *) const; + using AccessMap = DenseMap>; + using DefsMap = DenseMap>; + + void + determineInsertionPoint(const SmallPtrSetImpl &DefiningBlocks); + void markUnreachableAsLiveOnEntry(BasicBlock *BB); + bool dominatesUse(const DetachAccess *, const DetachAccess *) const; + DetachPhi *createDetachPhi(BasicBlock *BB); + // DetachUseOrDef *createNewAccess(Instruction *); + DetachAccess *findDominatingDef(BasicBlock *, enum InsertionPlace); + void placePHINodes(const SmallPtrSetImpl &, + const DenseMap &); + DetachAccess *renameBlock(BasicBlock *, DetachAccess *, bool); + void renameSuccessorPhis(BasicBlock *, DetachAccess *, bool); + void renamePass(DomTreeNode *, DetachAccess *IncomingVal, + SmallPtrSetImpl &Visited, + bool SkipVisited = false, bool RenameAllUses = false); + AccessList *getOrCreateAccessList(const BasicBlock *); + DefsList *getOrCreateDefsList(const BasicBlock *); + void renumberBlock(const BasicBlock *) const; + DominatorTree *DT; + Function &F; + + // Detach SSA mappings + DenseMap ValueToDetachAccess; + // These two mappings contain the main block to access/def mappings for + // DetachSSA. The list contained in PerBlockAccesses really owns all the + // DetachAccesses. 
+  // Both maps maintain the invariant that if a block is found in them, the
+  // corresponding list is not empty, and if a block is not found in them, the
+  // corresponding list is empty.
+  AccessMap PerBlockAccesses;
+  DefsMap PerBlockDefs;
+  std::unique_ptr<DetachAccess> LiveOnEntryDef;
+
+  // Domination mappings
+  // Note that the numbering is local to a block, even though the map is
+  // global.
+  mutable SmallPtrSet<const BasicBlock *, 16> BlockNumberingValid;
+  mutable DenseMap<const DetachAccess *, unsigned long> BlockNumbering;
+
+  // Memory SSA building info
+  // std::unique_ptr<CachingWalker> Walker;
+  unsigned NextID;
+};
+
+// This pass does eager building and then printing of DetachSSA. It is used by
+// the tests to be able to build, dump, and verify Detach SSA.
+class DetachSSAPrinterLegacyPass : public FunctionPass {
+public:
+  DetachSSAPrinterLegacyPass();
+
+  bool runOnFunction(Function &) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+  static char ID;
+};
+
+/// An analysis that produces \c DetachSSA for a function.
+///
+class DetachSSAAnalysis : public AnalysisInfoMixin<DetachSSAAnalysis> {
+  friend AnalysisInfoMixin<DetachSSAAnalysis>;
+
+  static AnalysisKey Key;
+
+public:
+  // Wrap DetachSSA result to ensure address stability of internal DetachSSA
+  // pointers after construction. Use a wrapper class instead of plain
+  // unique_ptr to avoid build breakage on MSVC.
+  struct Result {
+    Result(std::unique_ptr<DetachSSA> &&DSSA) : DSSA(std::move(DSSA)) {}
+    DetachSSA &getDSSA() { return *DSSA.get(); }
+
+    std::unique_ptr<DetachSSA> DSSA;
+  };
+
+  Result run(Function &F, FunctionAnalysisManager &AM);
+};
+
+/// \brief Printer pass for \c DetachSSA.
+class DetachSSAPrinterPass : public PassInfoMixin<DetachSSAPrinterPass> {
+  raw_ostream &OS;
+
+public:
+  explicit DetachSSAPrinterPass(raw_ostream &OS) : OS(OS) {}
+
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
+/// \brief Verifier pass for \c DetachSSA.
+struct DetachSSAVerifierPass : PassInfoMixin { + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + +/// \brief Legacy analysis pass which computes \c DetachSSA. +class DetachSSAWrapperPass : public FunctionPass { +public: + DetachSSAWrapperPass(); + + static char ID; + + bool runOnFunction(Function &) override; + void releaseMemory() override; + DetachSSA &getDSSA() { return *DSSA; } + const DetachSSA &getDSSA() const { return *DSSA; } + + void getAnalysisUsage(AnalysisUsage &AU) const override; + + void verifyAnalysis() const override; + void print(raw_ostream &OS, const Module *M = nullptr) const override; + +private: + std::unique_ptr DSSA; +}; + +/// \brief Iterator base class used to implement const and non-const iterators +/// over the defining accesses of a DetachAccess. +template +class detachaccess_def_iterator_base + : public iterator_facade_base, + std::forward_iterator_tag, T, ptrdiff_t, T *, + T *> { + using BaseT = typename detachaccess_def_iterator_base::iterator_facade_base; + +public: + detachaccess_def_iterator_base(T *Start) : Access(Start) {} + detachaccess_def_iterator_base() = default; + + bool operator==(const detachaccess_def_iterator_base &Other) const { + return Access == Other.Access && (!Access || ArgNo == Other.ArgNo); + } + + // This is a bit ugly, but for DetachPHI's, unlike PHINodes, you can't get the + // block from the operand in constant time (In a PHINode, the uselist has + // both, so it's just subtraction). We provide it as part of the + // iterator to avoid callers having to linear walk to get the block. + // If the operation becomes constant time on DetachPHI's, this bit of + // abstraction breaking should be removed. 
+  BasicBlock *getPhiArgBlock() const {
+    DetachPhi *DP = dyn_cast<DetachPhi>(Access);
+    assert(DP && "Tried to get phi arg block when not iterating over a PHI");
+    return DP->getIncomingBlock(ArgNo);
+  }
+  typename BaseT::iterator::pointer operator*() const {
+    assert(Access && "Tried to access past the end of our iterator");
+    // Go to the first argument for phis, and the defining access for everything
+    // else.
+    if (DetachPhi *DP = dyn_cast<DetachPhi>(Access))
+      return DP->getIncomingValue(ArgNo);
+    return cast<DetachUseOrDef>(Access)->getDefiningAccess();
+  }
+  using BaseT::operator++;
+  detachaccess_def_iterator_base &operator++() {
+    assert(Access && "Hit end of iterator");
+    if (DetachPhi *DP = dyn_cast<DetachPhi>(Access)) {
+      if (++ArgNo >= DP->getNumIncomingValues()) {
+        ArgNo = 0;
+        Access = nullptr;
+      }
+    } else {
+      Access = nullptr;
+    }
+    return *this;
+  }
+
+private:
+  T *Access = nullptr;
+  unsigned ArgNo = 0;
+};
+
+inline detachaccess_def_iterator DetachAccess::defs_begin() {
+  return detachaccess_def_iterator(this);
+}
+
+inline const_detachaccess_def_iterator DetachAccess::defs_begin() const {
+  return const_detachaccess_def_iterator(this);
+}
+
+inline detachaccess_def_iterator DetachAccess::defs_end() {
+  return detachaccess_def_iterator();
+}
+
+inline const_detachaccess_def_iterator DetachAccess::defs_end() const {
+  return const_detachaccess_def_iterator();
+}
+
+/// \brief GraphTraits for a DetachAccess, which walks defs in the normal case,
+/// and uses in the inverse case.
+template <> struct GraphTraits<DetachAccess *> {
+  using NodeRef = DetachAccess *;
+  using ChildIteratorType = detachaccess_def_iterator;
+
+  static NodeRef getEntryNode(NodeRef N) { return N; }
+  static ChildIteratorType child_begin(NodeRef N) { return N->defs_begin(); }
+  static ChildIteratorType child_end(NodeRef N) { return N->defs_end(); }
+};
+
+template <> struct GraphTraits<Inverse<DetachAccess *>> {
+  using NodeRef = DetachAccess *;
+  using ChildIteratorType = DetachAccess::iterator;
+
+  static NodeRef getEntryNode(NodeRef N) { return N; }
+  static ChildIteratorType child_begin(NodeRef N) { return N->user_begin(); }
+  static ChildIteratorType child_end(NodeRef N) { return N->user_end(); }
+};
+
+} // End namespace llvm
+
+#endif // LLVM_ANALYSIS_DETACHSSA_H
diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
index f0d11e9c16894e..c53ca11aaae3ea 100644
--- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
@@ -534,6 +534,9 @@ enum FunctionCodes {
   // 54 is unused.
   FUNC_CODE_OPERAND_BUNDLE = 55, // OPERAND_BUNDLE: [tag#, value...]
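The def-iterator in DetachSSA.h steps through a DetachPhi's incoming values one `ArgNo` at a time and through a plain use/def's single defining access. The same stepping logic can be sketched in a self-contained form; `MiniAccess`, `MiniDefIterator`, and `countDefs` below are hypothetical stand-ins for `DetachPhi`/`DetachUseOrDef` and the iterator above, not part of the patch.

```cpp
#include <cassert>
#include <vector>

// Hypothetical stand-in for DetachPhi / DetachUseOrDef: a node with a
// non-empty Incoming list plays the role of a phi; otherwise Defining is
// its single defining access.
struct MiniAccess {
  std::vector<MiniAccess *> Incoming; // phi operands (empty if not a phi)
  MiniAccess *Defining = nullptr;     // defining access for non-phis
  bool isPhi() const { return !Incoming.empty(); }
};

// Mirrors detachaccess_def_iterator_base: a phi yields each incoming value
// in turn; anything else yields its one defining access and then reaches
// the end state (Access == nullptr), the two cases in operator++ above.
struct MiniDefIterator {
  MiniAccess *Access = nullptr;
  unsigned ArgNo = 0;

  MiniAccess *operator*() const {
    assert(Access && "Tried to access past the end of our iterator");
    return Access->isPhi() ? Access->Incoming[ArgNo] : Access->Defining;
  }
  MiniDefIterator &operator++() {
    assert(Access && "Hit end of iterator");
    if (Access->isPhi()) {
      if (++ArgNo >= Access->Incoming.size()) {
        ArgNo = 0;
        Access = nullptr;
      }
    } else {
      Access = nullptr;
    }
    return *this;
  }
  bool atEnd() const { return Access == nullptr; }
};

// Count how many defining accesses the iterator visits for a node.
inline unsigned countDefs(MiniAccess *A) {
  unsigned N = 0;
  for (MiniDefIterator It{A}; !It.atEnd(); ++It)
    ++N;
  return N;
}
```

A phi thus contributes one visit per incoming access, while a use or def contributes exactly one, which is why the iterator's end condition is a null `Access` rather than an element count.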
FUNC_CODE_INST_UNOP = 56, // UNOP: [opcode, ty, opval] + FUNC_CODE_INST_DETACH = 57, // DETACH: [bb#, bb#] + FUNC_CODE_INST_REATTACH = 58, // REATTACH + FUNC_CODE_INST_SYNC = 59, // SYNC: [bb#] }; enum UseListCodes { diff --git a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h index d1770bf6e4cead..3c9c9d4f76be62 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h @@ -282,6 +282,12 @@ class IRTranslator : public MachineFunctionPass { bool translateIndirectBr(const User &U, MachineIRBuilder &MIRBuilder); + bool translateDetach(const User &U, MachineIRBuilder &MIRBuilder); + + bool translateReattach(const User &U, MachineIRBuilder &MIRBuilder); + + bool translateSync(const User &U, MachineIRBuilder &MIRBuilder); + bool translateExtractValue(const User &U, MachineIRBuilder &MIRBuilder); bool translateInsertValue(const User &U, MachineIRBuilder &MIRBuilder); diff --git a/llvm/include/llvm/IR/BasicBlock.h b/llvm/include/llvm/IR/BasicBlock.h index 99eac33f742ec2..bcab517887c790 100644 --- a/llvm/include/llvm/IR/BasicBlock.h +++ b/llvm/include/llvm/IR/BasicBlock.h @@ -382,6 +382,7 @@ class BasicBlock final : public Value, // Basic blocks are data objects also /// /// Also note that this doesn't preserve any passes. To split blocks while /// keeping loop information consistent, use the SplitBlock utility function. 
+ BasicBlock *splitBasicBlockWithTerminator(const Twine &BBName = ""); BasicBlock *splitBasicBlock(iterator I, const Twine &BBName = ""); BasicBlock *splitBasicBlock(Instruction *I, const Twine &BBName = "") { return splitBasicBlock(I->getIterator(), BBName); diff --git a/llvm/include/llvm/IR/DerivedTypes.h b/llvm/include/llvm/IR/DerivedTypes.h index 9526d6287d2f83..8c56973b517d6c 100644 --- a/llvm/include/llvm/IR/DerivedTypes.h +++ b/llvm/include/llvm/IR/DerivedTypes.h @@ -245,6 +245,10 @@ class StructType : public CompositeType { /// Create an empty structure type. static StructType *get(LLVMContext &Context, bool isPacked = false); + /// Try to lookup a structure type by name, and create one if one does not + /// exist. + static StructType *getOrCreate(LLVMContext &Context, StringRef Name); + /// This static method is a convenience method for creating structure types by /// specifying the elements as arguments. Note that this method always returns /// a non-packed struct, and requires at least one element type. diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h index fac2ff46c4531a..d5746c86329fd4 100644 --- a/llvm/include/llvm/IR/IRBuilder.h +++ b/llvm/include/llvm/IR/IRBuilder.h @@ -979,6 +979,26 @@ class IRBuilder : public IRBuilderBase, public Inserter { return Insert(new UnreachableInst(Context)); } + /// \brief Create a detach instruction, 'detach within SyncRegion, Detached, + // Continue'. + DetachInst *CreateDetach(BasicBlock *Detached, BasicBlock *Continue, + Value *SyncRegion, MDNode *BranchWeights = nullptr) { + return Insert(addBranchMetadata(DetachInst::Create(Detached, Continue, + SyncRegion), + BranchWeights, nullptr)); + } + + /// \brief Create a reattach instruction, 'reattach within SyncRegion, + /// DetachContinue'. 
+ ReattachInst *CreateReattach(BasicBlock *DetachContinue, Value *SyncRegion) { + return Insert(ReattachInst::Create(DetachContinue, SyncRegion)); + } + + /// \brief Create a sync instruction, 'sync within SyncRegion, Continue'. + SyncInst *CreateSync(BasicBlock *Continue, Value *SyncRegion) { + return Insert(SyncInst::Create(Continue, SyncRegion)); + } + //===--------------------------------------------------------------------===// // Instruction creation methods: Binary Operators //===--------------------------------------------------------------------===// diff --git a/llvm/include/llvm/IR/InstVisitor.h b/llvm/include/llvm/IR/InstVisitor.h index c5b4c6f71d7d8e..f068b39f959215 100644 --- a/llvm/include/llvm/IR/InstVisitor.h +++ b/llvm/include/llvm/IR/InstVisitor.h @@ -166,6 +166,18 @@ class InstVisitor { // Specific Instruction type classes... note that all of the casts are // necessary because we use the instruction classes as opaque types... // + RetTy visitReturnInst(ReturnInst &I) { DELEGATE(TerminatorInst);} + RetTy visitBranchInst(BranchInst &I) { DELEGATE(TerminatorInst);} + RetTy visitSwitchInst(SwitchInst &I) { DELEGATE(TerminatorInst);} + RetTy visitIndirectBrInst(IndirectBrInst &I) { DELEGATE(TerminatorInst);} + RetTy visitResumeInst(ResumeInst &I) { DELEGATE(TerminatorInst);} + RetTy visitUnreachableInst(UnreachableInst &I) { DELEGATE(TerminatorInst);} + RetTy visitCleanupReturnInst(CleanupReturnInst &I) { DELEGATE(TerminatorInst);} + RetTy visitCatchReturnInst(CatchReturnInst &I) { DELEGATE(TerminatorInst); } + RetTy visitCatchSwitchInst(CatchSwitchInst &I) { DELEGATE(TerminatorInst);} + RetTy visitDetachInst(DetachInst &I) { DELEGATE(TerminatorInst);} + RetTy visitReattachInst(ReattachInst &I) { DELEGATE(TerminatorInst);} + RetTy visitSyncInst(SyncInst &I) { DELEGATE(TerminatorInst);} RetTy visitICmpInst(ICmpInst &I) { DELEGATE(CmpInst);} RetTy visitFCmpInst(FCmpInst &I) { DELEGATE(CmpInst);} RetTy visitAllocaInst(AllocaInst &I) { 
DELEGATE(UnaryInstruction);}
diff --git a/llvm/include/llvm/IR/Instruction.def b/llvm/include/llvm/IR/Instruction.def
index 58e4e2e1d6cc50..80297e4c7ab603 100644
--- a/llvm/include/llvm/IR/Instruction.def
+++ b/llvm/include/llvm/IR/Instruction.def
@@ -135,89 +135,92 @@
 HANDLE_TERM_INST  ( 7, Unreachable , UnreachableInst)
 HANDLE_TERM_INST  ( 8, CleanupRet  , CleanupReturnInst)
 HANDLE_TERM_INST  ( 9, CatchRet    , CatchReturnInst)
 HANDLE_TERM_INST  (10, CatchSwitch , CatchSwitchInst)
-  LAST_TERM_INST  (10)
+HANDLE_TERM_INST  (11, Detach      , DetachInst)
+HANDLE_TERM_INST  (12, Reattach    , ReattachInst)
+HANDLE_TERM_INST  (13, Sync        , SyncInst)
+  LAST_TERM_INST  (13)
 
 // Standard unary operators...
- FIRST_UNARY_INST(11)
-HANDLE_UNARY_INST(11, FNeg , UnaryOperator)
-  LAST_UNARY_INST(11)
+ FIRST_UNARY_INST(14)
+HANDLE_UNARY_INST(14, FNeg , UnaryOperator)
+  LAST_UNARY_INST(14)
 
 // Standard binary operators...
- FIRST_BINARY_INST(12)
-HANDLE_BINARY_INST(12, Add  , BinaryOperator)
-HANDLE_BINARY_INST(13, FAdd , BinaryOperator)
-HANDLE_BINARY_INST(14, Sub  , BinaryOperator)
-HANDLE_BINARY_INST(15, FSub , BinaryOperator)
-HANDLE_BINARY_INST(16, Mul  , BinaryOperator)
-HANDLE_BINARY_INST(17, FMul , BinaryOperator)
-HANDLE_BINARY_INST(18, UDiv , BinaryOperator)
-HANDLE_BINARY_INST(19, SDiv , BinaryOperator)
-HANDLE_BINARY_INST(20, FDiv , BinaryOperator)
-HANDLE_BINARY_INST(21, URem , BinaryOperator)
-HANDLE_BINARY_INST(22, SRem , BinaryOperator)
-HANDLE_BINARY_INST(23, FRem , BinaryOperator)
+ FIRST_BINARY_INST(15)
+HANDLE_BINARY_INST(15, Add  , BinaryOperator)
+HANDLE_BINARY_INST(16, FAdd , BinaryOperator)
+HANDLE_BINARY_INST(17, Sub  , BinaryOperator)
+HANDLE_BINARY_INST(18, FSub , BinaryOperator)
+HANDLE_BINARY_INST(19, Mul  , BinaryOperator)
+HANDLE_BINARY_INST(20, FMul , BinaryOperator)
+HANDLE_BINARY_INST(21, UDiv , BinaryOperator)
+HANDLE_BINARY_INST(22, SDiv , BinaryOperator)
+HANDLE_BINARY_INST(23, FDiv , BinaryOperator)
+HANDLE_BINARY_INST(24, URem , BinaryOperator)
+HANDLE_BINARY_INST(25, SRem , BinaryOperator)
+HANDLE_BINARY_INST(26, FRem , BinaryOperator)
 
 // Logical operators (integer operands)
-HANDLE_BINARY_INST(24, Shl  , BinaryOperator) // Shift left  (logical)
-HANDLE_BINARY_INST(25, LShr , BinaryOperator) // Shift right (logical)
-HANDLE_BINARY_INST(26, AShr , BinaryOperator) // Shift right (arithmetic)
-HANDLE_BINARY_INST(27, And  , BinaryOperator)
-HANDLE_BINARY_INST(28, Or   , BinaryOperator)
-HANDLE_BINARY_INST(29, Xor  , BinaryOperator)
-  LAST_BINARY_INST(29)
+HANDLE_BINARY_INST(27, Shl  , BinaryOperator) // Shift left  (logical)
+HANDLE_BINARY_INST(28, LShr , BinaryOperator) // Shift right (logical)
+HANDLE_BINARY_INST(29, AShr , BinaryOperator) // Shift right (arithmetic)
+HANDLE_BINARY_INST(30, And  , BinaryOperator)
+HANDLE_BINARY_INST(31, Or   , BinaryOperator)
+HANDLE_BINARY_INST(32, Xor  , BinaryOperator)
+  LAST_BINARY_INST(32)
 
 // Memory operators...
- FIRST_MEMORY_INST(30)
-HANDLE_MEMORY_INST(30, Alloca, AllocaInst)  // Stack management
-HANDLE_MEMORY_INST(31, Load  , LoadInst  )  // Memory manipulation instrs
-HANDLE_MEMORY_INST(32, Store , StoreInst )
-HANDLE_MEMORY_INST(33, GetElementPtr, GetElementPtrInst)
-HANDLE_MEMORY_INST(34, Fence , FenceInst )
-HANDLE_MEMORY_INST(35, AtomicCmpXchg , AtomicCmpXchgInst )
-HANDLE_MEMORY_INST(36, AtomicRMW , AtomicRMWInst )
-  LAST_MEMORY_INST(36)
+ FIRST_MEMORY_INST(33)
+HANDLE_MEMORY_INST(33, Alloca, AllocaInst)  // Stack management
+HANDLE_MEMORY_INST(34, Load  , LoadInst  )  // Memory manipulation instrs
+HANDLE_MEMORY_INST(35, Store , StoreInst )
+HANDLE_MEMORY_INST(36, GetElementPtr, GetElementPtrInst)
+HANDLE_MEMORY_INST(37, Fence , FenceInst )
+HANDLE_MEMORY_INST(38, AtomicCmpXchg , AtomicCmpXchgInst )
+HANDLE_MEMORY_INST(39, AtomicRMW , AtomicRMWInst )
+  LAST_MEMORY_INST(39)
 
 // Cast operators ...
 // NOTE: The order matters here because CastInst::isEliminableCastPair
 // NOTE: (see Instructions.cpp) encodes a table based on this ordering.
- FIRST_CAST_INST(37)
-HANDLE_CAST_INST(37, Trunc   , TruncInst   )  // Truncate integers
-HANDLE_CAST_INST(38, ZExt    , ZExtInst    )  // Zero extend integers
-HANDLE_CAST_INST(39, SExt    , SExtInst    )  // Sign extend integers
-HANDLE_CAST_INST(40, FPToUI  , FPToUIInst  )  // floating point -> UInt
-HANDLE_CAST_INST(41, FPToSI  , FPToSIInst  )  // floating point -> SInt
-HANDLE_CAST_INST(42, UIToFP  , UIToFPInst  )  // UInt -> floating point
-HANDLE_CAST_INST(43, SIToFP  , SIToFPInst  )  // SInt -> floating point
-HANDLE_CAST_INST(44, FPTrunc , FPTruncInst )  // Truncate floating point
-HANDLE_CAST_INST(45, FPExt   , FPExtInst   )  // Extend floating point
-HANDLE_CAST_INST(46, PtrToInt, PtrToIntInst)  // Pointer -> Integer
-HANDLE_CAST_INST(47, IntToPtr, IntToPtrInst)  // Integer -> Pointer
-HANDLE_CAST_INST(48, BitCast , BitCastInst )  // Type cast
-HANDLE_CAST_INST(49, AddrSpaceCast, AddrSpaceCastInst)  // addrspace cast
-  LAST_CAST_INST(49)
-
- FIRST_FUNCLETPAD_INST(50)
-HANDLE_FUNCLETPAD_INST(50, CleanupPad, CleanupPadInst)
-HANDLE_FUNCLETPAD_INST(51, CatchPad  , CatchPadInst)
-  LAST_FUNCLETPAD_INST(51)
+ FIRST_CAST_INST(40)
+HANDLE_CAST_INST(40, Trunc   , TruncInst   )  // Truncate integers
+HANDLE_CAST_INST(41, ZExt    , ZExtInst    )  // Zero extend integers
+HANDLE_CAST_INST(42, SExt    , SExtInst    )  // Sign extend integers
+HANDLE_CAST_INST(43, FPToUI  , FPToUIInst  )  // floating point -> UInt
+HANDLE_CAST_INST(44, FPToSI  , FPToSIInst  )  // floating point -> SInt
+HANDLE_CAST_INST(45, UIToFP  , UIToFPInst  )  // UInt -> floating point
+HANDLE_CAST_INST(46, SIToFP  , SIToFPInst  )  // SInt -> floating point
+HANDLE_CAST_INST(47, FPTrunc , FPTruncInst )  // Truncate floating point
+HANDLE_CAST_INST(48, FPExt   , FPExtInst   )  // Extend floating point
+HANDLE_CAST_INST(49, PtrToInt, PtrToIntInst)  // Pointer -> Integer
+HANDLE_CAST_INST(50, IntToPtr, IntToPtrInst)  // Integer -> Pointer
+HANDLE_CAST_INST(51, BitCast , BitCastInst )  // Type cast
+HANDLE_CAST_INST(52, AddrSpaceCast, AddrSpaceCastInst)  // addrspace cast
+  LAST_CAST_INST(52)
+
+ FIRST_FUNCLETPAD_INST(53)
+HANDLE_FUNCLETPAD_INST(53, CleanupPad, CleanupPadInst)
+HANDLE_FUNCLETPAD_INST(54, CatchPad  , CatchPadInst)
+  LAST_FUNCLETPAD_INST(54)
 
 // Other operators...
- FIRST_OTHER_INST(52)
-HANDLE_OTHER_INST(52, ICmp   , ICmpInst   )  // Integer comparison instruction
-HANDLE_OTHER_INST(53, FCmp   , FCmpInst   )  // Floating point comparison instr.
-HANDLE_OTHER_INST(54, PHI    , PHINode    )  // PHI node instruction
-HANDLE_OTHER_INST(55, Call   , CallInst   )  // Call a function
-HANDLE_OTHER_INST(56, Select , SelectInst )  // select instruction
-HANDLE_USER_INST (57, UserOp1, Instruction)  // May be used internally in a pass
-HANDLE_USER_INST (58, UserOp2, Instruction)  // Internal to passes only
-HANDLE_OTHER_INST(59, VAArg  , VAArgInst  )  // vaarg instruction
-HANDLE_OTHER_INST(60, ExtractElement, ExtractElementInst)// extract from vector
-HANDLE_OTHER_INST(61, InsertElement, InsertElementInst)  // insert into vector
-HANDLE_OTHER_INST(62, ShuffleVector, ShuffleVectorInst)  // shuffle two vectors.
-HANDLE_OTHER_INST(63, ExtractValue, ExtractValueInst)// extract from aggregate
-HANDLE_OTHER_INST(64, InsertValue, InsertValueInst) // insert into aggregate
-HANDLE_OTHER_INST(65, LandingPad, LandingPadInst)  // Landing pad instruction.
-  LAST_OTHER_INST(65)
+ FIRST_OTHER_INST(55)
+HANDLE_OTHER_INST(55, ICmp   , ICmpInst   )  // Integer comparison instruction
+HANDLE_OTHER_INST(56, FCmp   , FCmpInst   )  // Floating point comparison instr.
+HANDLE_OTHER_INST(57, PHI    , PHINode    )  // PHI node instruction
+HANDLE_OTHER_INST(58, Call   , CallInst   )  // Call a function
+HANDLE_OTHER_INST(59, Select , SelectInst )  // select instruction
+HANDLE_USER_INST (60, UserOp1, Instruction)  // May be used internally in a pass
+HANDLE_USER_INST (61, UserOp2, Instruction)  // Internal to passes only
+HANDLE_OTHER_INST(62, VAArg  , VAArgInst  )  // vaarg instruction
+HANDLE_OTHER_INST(63, ExtractElement, ExtractElementInst)// extract from vector
+HANDLE_OTHER_INST(64, InsertElement, InsertElementInst)  // insert into vector
+HANDLE_OTHER_INST(65, ShuffleVector, ShuffleVectorInst)  // shuffle two vectors.
+HANDLE_OTHER_INST(66, ExtractValue, ExtractValueInst)// extract from aggregate
+HANDLE_OTHER_INST(67, InsertValue, InsertValueInst) // insert into aggregate
+HANDLE_OTHER_INST(68, LandingPad, LandingPadInst)  // Landing pad instruction.
+  LAST_OTHER_INST(68)
 
 #undef FIRST_TERM_INST
 #undef HANDLE_TERM_INST
diff --git a/llvm/include/llvm/IR/Instruction.h b/llvm/include/llvm/IR/Instruction.h
index 5e78cb1edf02b1..f2f161730c7084 100644
--- a/llvm/include/llvm/IR/Instruction.h
+++ b/llvm/include/llvm/IR/Instruction.h
@@ -545,6 +545,7 @@ class Instruction : public User,
     // This list should be kept in sync with the list in mayWriteToMemory for
     // all opcodes which don't have a memory location.
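The Instruction.def changes above register detach, reattach, and sync as first-class terminators. Combined with the `@llvm.syncregion.start` intrinsic added later in this patch, a spawned call takes roughly the following shape; this IR is written by hand from the commit message, so the block names, the `@work` callee, and the exact textual syntax are illustrative assumptions rather than output from the patched compiler.

```llvm
define void @spawn_example() {
entry:
  ; one sync region per spawning function, started in the entry block
  %sr = call token @llvm.syncregion.start()
  detach within %sr, label %spawned, label %continue

spawned:                                ; detached sub-CFG
  call void @work()
  reattach within %sr, label %continue

continue:
  sync within %sr, label %exit          ; waits only on detaches in %sr

exit:
  ret void
}

declare token @llvm.syncregion.start()
declare void @work()
```

Because the sync names its region token, the implicit sync at the end of a `_Cilk_for` waits only on that loop's iterations, and an inlined `_Cilk_sync` cannot erroneously wait on its caller's detached work.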
case Instruction::Fence: + case Instruction::Sync: // Like Instruction::Fence case Instruction::CatchPad: case Instruction::CatchRet: case Instruction::Call: diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h index 0ff8f56f213ad0..5557cb4fdae59c 100644 --- a/llvm/include/llvm/IR/Instructions.h +++ b/llvm/include/llvm/IR/Instructions.h @@ -4403,6 +4403,255 @@ class UnreachableInst : public Instruction { } }; +//===----------------------------------------------------------------------===// +// DetachInst Class +//===----------------------------------------------------------------------===// + +//===--------------------------------------------------------------------------- +/// DetachInst - Detach instruction +/// +class DetachInst : public TerminatorInst { + /// Ops list - The operands are ordered: Detached, Continue. + DetachInst(const DetachInst &DI); + void AssertOK(); + // DetachInst constructors (where {D, C} are blocks and SR is a token): + // DetachInst(BB *D, BB *C, Value *SR) - 'detach SR, D, C' + // DetachInst(BB *D, BB *C, Value *SR, Inst *I) + // - 'detach SR, D, C', insert before I + // DetachInst(BB *D, BB *C, Value *SR, BB *I) + // - 'detach SR, D, C', insert at end + DetachInst(BasicBlock *Detached, BasicBlock *Continue, + Value *SyncRegion, + Instruction *InsertBefore = nullptr); + DetachInst(BasicBlock *Detached, BasicBlock *Continue, + Value *SyncRegion, + BasicBlock *InsertAtEnd); +protected: + // Note: Instruction needs to be a friend here to call cloneImpl. 
+  friend class Instruction;
+  DetachInst *cloneImpl() const;
+
+public:
+  static DetachInst *Create(BasicBlock *Detached, BasicBlock *Continue,
+                            Value *SyncRegion,
+                            Instruction *InsertBefore = nullptr) {
+    return new(3) DetachInst(Detached, Continue, SyncRegion, InsertBefore);
+  }
+  static DetachInst *Create(BasicBlock *Detached, BasicBlock *Continue,
+                            Value *SyncRegion,
+                            BasicBlock *InsertAtEnd) {
+    return new(3) DetachInst(Detached, Continue, SyncRegion, InsertAtEnd);
+  }
+
+  /// Provide fast operand accessors
+  DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
+
+  Value *getSyncRegion() const {
+    return Op<-3>();
+  }
+
+  void setSyncRegion(Value *SyncRegion) {
+    Op<-3>() = SyncRegion;
+  }
+
+  unsigned getNumSuccessors() const { return 2; }
+
+  BasicBlock *getSuccessor(unsigned i) const {
+    assert(i < getNumSuccessors() && "Successor # out of range for detach!");
+    return cast_or_null<BasicBlock>((&Op<-1>() - i)->get());
+  }
+
+  void setSuccessor(unsigned idx, BasicBlock *NewSucc) {
+    assert(idx < getNumSuccessors() && "Successor # out of range for detach!");
+    *(&Op<-1>() - idx) = (Value*)NewSucc;
+  }
+
+  // Methods for support type inquiry through isa, cast, and dyn_cast:
+  static inline bool classof(const Instruction *I) {
+    return (I->getOpcode() == Instruction::Detach);
+  }
+  static inline bool classof(const Value *V) {
+    return isa<Instruction>(V) && classof(cast<Instruction>(V));
+  }
+
+  inline BasicBlock* getDetached() const { return getSuccessor(0); }
+  inline BasicBlock* getContinue() const { return getSuccessor(1); }
+private:
+  friend TerminatorInst;
+
+  BasicBlock *getSuccessorV(unsigned idx) const;
+  unsigned getNumSuccessorsV() const;
+  void setSuccessorV(unsigned idx, BasicBlock *B);
+};
+
+template <>
+struct OperandTraits<DetachInst> : public VariadicOperandTraits<DetachInst, 3> {
+};
+
+DEFINE_TRANSPARENT_OPERAND_ACCESSORS(DetachInst, Value)
+
+//===----------------------------------------------------------------------===//
+// ReattachInst Class
+//===----------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------------
+/// ReattachInst - Reattach instruction.  This instruction terminates
+/// a detached sub-CFG.  The DetachContinue field (its sole successor)
+/// maintains the continue block after the detach instruction
+/// corresponding to this reattach.
+///
+class ReattachInst : public TerminatorInst {
+  ReattachInst(const ReattachInst &RI);
+  void AssertOK();
+  // ReattachInst constructors (where C is a block and SR is a token):
+  //   ReattachInst(BB *C, Value *SR)          - 'reattach SR, C'
+  //   ReattachInst(BB *C, Value *SR, Inst *I) - 'reattach SR, C', insert before I
+  //   ReattachInst(BB *C, Value *SR, BB *I)   - 'reattach SR, C', insert at end
+  explicit ReattachInst(BasicBlock *DetachContinue, Value *SyncRegion,
+                        Instruction *InsertBefore = nullptr);
+  ReattachInst(BasicBlock *DetachContinue, Value *SyncRegion,
+               BasicBlock *InsertAtEnd);
+protected:
+  // Note: Instruction needs to be a friend here to call cloneImpl.
+  friend class Instruction;
+  ReattachInst *cloneImpl() const;
+
+public:
+  static ReattachInst *Create(BasicBlock *DetachContinue, Value *SyncRegion,
+                              Instruction *InsertBefore = nullptr) {
+    return new(2) ReattachInst(DetachContinue, SyncRegion, InsertBefore);
+  }
+
+  static ReattachInst *Create(BasicBlock *DetachContinue, Value *SyncRegion,
+                              BasicBlock *InsertAtEnd) {
+    return new(2) ReattachInst(DetachContinue, SyncRegion, InsertAtEnd);
+  }
+
+  /// Transparently provide more efficient getOperand methods.
+  DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
+
+  Value *getSyncRegion() const {
+    return Op<-2>();
+  }
+
+  void setSyncRegion(Value *SyncRegion) {
+    Op<-2>() = SyncRegion;
+  }
+
+  unsigned getNumSuccessors() const { return 1; }
+
+  BasicBlock *getDetachContinue() const {
+    return cast_or_null<BasicBlock>((&Op<-1>())->get());
+  }
+
+  // Methods for support type inquiry through isa, cast, and dyn_cast:
+  static inline bool classof(const Instruction *I) {
+    return I->getOpcode() == Instruction::Reattach;
+  }
+  static inline bool classof(const Value *V) {
+    return isa<Instruction>(V) && classof(cast<Instruction>(V));
+  }
+  BasicBlock *getSuccessor(unsigned i) const {
+    assert(i < getNumSuccessors() && "Successor # out of range for reattach!");
+    return cast_or_null<BasicBlock>((&Op<-1>() - i)->get());
+  }
+  void setSuccessor(unsigned idx, BasicBlock *NewSucc) {
+    assert(idx < getNumSuccessors() &&
+           "Successor # out of range for reattach!");
+    *(&Op<-1>() - idx) = NewSucc;
+  }
+private:
+  friend TerminatorInst;
+
+  BasicBlock *getSuccessorV(unsigned idx) const;
+  unsigned getNumSuccessorsV() const;
+  void setSuccessorV(unsigned idx, BasicBlock *B);
+};
+
+template <>
+struct OperandTraits<ReattachInst>
+    : public VariadicOperandTraits<ReattachInst, 2> {
+};
+
+DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ReattachInst, Value)
+
+//===----------------------------------------------------------------------===//
+// SyncInst Class
+//===----------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------------
+/// SyncInst - Sync instruction.
+///
+class SyncInst : public TerminatorInst {
+  /// Ops list - A sync looks like an unconditional branch to its continuation.
+  SyncInst(const SyncInst &SI);
+  void AssertOK();
+  // SyncInst constructor (where C is a block and SR is a token):
+  //   SyncInst(BB *C, Value *SR)          - 'sync SR, C'
+  //   SyncInst(BB *C, Value *SR, Inst *I) - 'sync SR, C' insert before I
+  //   SyncInst(BB *C, Value *SR, BB *I)   - 'sync SR, C' insert at end
+  explicit SyncInst(BasicBlock *Continue, Value *SyncRegion,
+                    Instruction *InsertBefore = nullptr);
+  SyncInst(BasicBlock *Continue, Value *SyncRegion,
+           BasicBlock *InsertAtEnd);
+protected:
+  // Note: Instruction needs to be a friend here to call cloneImpl.
+  friend class Instruction;
+  SyncInst *cloneImpl() const;
+
+public:
+  static SyncInst *Create(BasicBlock *Continue,
+                          Value *SyncRegion,
+                          Instruction *InsertBefore = nullptr) {
+    return new(2) SyncInst(Continue, SyncRegion, InsertBefore);
+  }
+  static SyncInst *Create(BasicBlock *Continue,
+                          Value *SyncRegion, BasicBlock *InsertAtEnd) {
+    return new(2) SyncInst(Continue, SyncRegion, InsertAtEnd);
+  }
+
+  /// Transparently provide more efficient getOperand methods.
+  DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
+
+  Value *getSyncRegion() const {
+    return Op<-2>();
+  }
+
+  void setSyncRegion(Value *SyncRegion) {
+    Op<-2>() = SyncRegion;
+  }
+
+  unsigned getNumSuccessors() const { return 1; }
+
+  // Methods for support type inquiry through isa, cast, and dyn_cast:
+  static inline bool classof(const Instruction *I) {
+    return I->getOpcode() == Instruction::Sync;
+  }
+  static inline bool classof(const Value *V) {
+    return isa<Instruction>(V) && classof(cast<Instruction>(V));
+  }
+
+  BasicBlock *getSuccessor(unsigned i) const {
+    assert(i < getNumSuccessors() && "Successor # out of range for sync!");
+    return cast_or_null<BasicBlock>((&Op<-1>() - i)->get());
+  }
+  void setSuccessor(unsigned idx, BasicBlock *NewSucc) {
+    assert(idx < getNumSuccessors() && "Successor # out of range for sync!");
+    *(&Op<-1>() - idx) = NewSucc;
+  }
+private:
+  friend TerminatorInst;
+
+  BasicBlock *getSuccessorV(unsigned idx) const;
+  unsigned getNumSuccessorsV() const;
+  void setSuccessorV(unsigned idx, BasicBlock *B);
+};
+
+template <>
+struct OperandTraits<SyncInst> : public VariadicOperandTraits<SyncInst, 2> {
+};
+
+DEFINE_TRANSPARENT_OPERAND_ACCESSORS(SyncInst, Value)
+
 //===----------------------------------------------------------------------===//
 // TruncInst Class
 //===----------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 64603d8ea03091..0eedd5e98f83e9 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -952,6 +952,13 @@ def int_coro_subfn_addr : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, llvm_i8_ty],
                                     [IntrReadMem, IntrArgMemOnly, ReadOnly<0>,
                                      NoCapture<0>]>;
 
+///===-------------------------- Tapir Intrinsics --------------------------===//
+//
+def int_syncregion_start : Intrinsic<[llvm_token_ty], [],
+                                     [IntrArgMemOnly]>;
+
+def int_detached_rethrow : Intrinsic<[], [], [Throws]>;
+
 ///===-------------------------- Other Intrinsics
--------------------------===// // def int_flt_rounds : Intrinsic<[llvm_i32_ty]>, diff --git a/llvm/include/llvm/IR/Value.def b/llvm/include/llvm/IR/Value.def index e2ddba0aa1596e..22ca38793f5278 100644 --- a/llvm/include/llvm/IR/Value.def +++ b/llvm/include/llvm/IR/Value.def @@ -103,6 +103,10 @@ HANDLE_MEMORY_VALUE(MemoryUse) HANDLE_MEMORY_VALUE(MemoryDef) HANDLE_MEMORY_VALUE(MemoryPhi) +HANDLE_MEMORY_VALUE(DetachUse) +HANDLE_MEMORY_VALUE(DetachDef) +HANDLE_MEMORY_VALUE(DetachPhi) + HANDLE_INSTRUCTION(Instruction) // Enum values starting at InstructionVal are used for Instructions; // don't add new values here! diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index 037c0dbb56ecec..3843050f205dfc 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -61,6 +61,9 @@ void initializeGlobalISel(PassRegistry&); /// Initialize all passes linked into the CodeGen library. void initializeTarget(PassRegistry&); +/// Initialize all passes linked into the TapirOpts library. 
+void initializeTapirOpts(PassRegistry&); + void initializeAAEvalLegacyPassPass(PassRegistry&); void initializeAAResultsWrapperPassPass(PassRegistry&); void initializeADCELegacyPassPass(PassRegistry&); @@ -100,7 +103,9 @@ void initializeCallGraphViewerPass(PassRegistry&); void initializeCallGraphWrapperPassPass(PassRegistry&); void initializeCallSiteSplittingLegacyPassPass(PassRegistry&); void initializeCalledValuePropagationLegacyPassPass(PassRegistry &); +void initializeCilkSanitizerPass(PassRegistry&); void initializeCodeGenPreparePass(PassRegistry&); +void initializeComprehensiveStaticInstrumentationPass(PassRegistry&); void initializeConstantHoistingLegacyPassPass(PassRegistry&); void initializeConstantMergeLegacyPassPass(PassRegistry&); void initializeConstantPropagationPass(PassRegistry&); @@ -119,6 +124,8 @@ void initializeDelinearizationPass(PassRegistry&); void initializeDemandedBitsWrapperPassPass(PassRegistry&); void initializeDependenceAnalysisPass(PassRegistry&); void initializeDependenceAnalysisWrapperPassPass(PassRegistry&); +void initializeDetachSSAPrinterLegacyPassPass(PassRegistry&); +void initializeDetachSSAWrapperPassPass(PassRegistry&); void initializeDetectDeadLanesPass(PassRegistry&); void initializeDivRemPairsLegacyPassPass(PassRegistry&); void initializeDomOnlyPrinterPass(PassRegistry&); @@ -219,6 +226,7 @@ void initializeLoopDeletionLegacyPassPass(PassRegistry&); void initializeLoopDistributeLegacyPass(PassRegistry&); void initializeLoopExtractorPass(PassRegistry&); void initializeLoopGuardWideningLegacyPassPass(PassRegistry&); +void initializeLoopFusePass(PassRegistry&); void initializeLoopIdiomRecognizeLegacyPassPass(PassRegistry&); void initializeLoopInfoWrapperPassPass(PassRegistry&); void initializeLoopInstSimplifyLegacyPassPass(PassRegistry&); @@ -230,6 +238,7 @@ void initializeLoopRerollPass(PassRegistry&); void initializeLoopRotateLegacyPassPass(PassRegistry&); void initializeLoopSimplifyCFGLegacyPassPass(PassRegistry&); void 
initializeLoopSimplifyPass(PassRegistry&); +void initializeLoopSpawningPass(PassRegistry&); void initializeLoopStrengthReducePass(PassRegistry&); void initializeLoopUnrollAndJamPass(PassRegistry&); void initializeLoopUnrollPass(PassRegistry&); @@ -244,6 +253,7 @@ void initializeLowerGuardIntrinsicLegacyPassPass(PassRegistry&); void initializeLowerIntrinsicsPass(PassRegistry&); void initializeLowerInvokeLegacyPassPass(PassRegistry&); void initializeLowerSwitchPass(PassRegistry&); +void initializeLowerTapirToCilkPass(PassRegistry&); void initializeLowerTypeTestsPass(PassRegistry&); void initializeMIRCanonicalizerPass(PassRegistry &); void initializeMIRPrintingPassPass(PassRegistry&); diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h index 0851c2f8d265bc..8564d42e5609d3 100644 --- a/llvm/include/llvm/LinkAllPasses.h +++ b/llvm/include/llvm/LinkAllPasses.h @@ -52,6 +52,7 @@ #include "llvm/Transforms/Scalar/InstSimplifyPass.h" #include "llvm/Transforms/Scalar/Scalarizer.h" #include "llvm/Transforms/Utils.h" +#include "llvm/Transforms/Tapir.h" #include "llvm/Transforms/Utils/SymbolRewriter.h" #include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h" #include "llvm/Transforms/Vectorize.h" @@ -131,6 +132,7 @@ namespace { (void) llvm::createLoopPredicationPass(); (void) llvm::createLoopSimplifyPass(); (void) llvm::createLoopSimplifyCFGPass(); + (void) llvm::createLoopSpawningPass(); (void) llvm::createLoopStrengthReducePass(); (void) llvm::createLoopRerollPass(); (void) llvm::createLoopUnrollPass(); @@ -142,6 +144,7 @@ namespace { (void) llvm::createLowerExpectIntrinsicPass(); (void) llvm::createLowerInvokePass(); (void) llvm::createLowerSwitchPass(); + (void) llvm::createLowerTapirToCilkPass(false,false); (void) llvm::createNaryReassociatePass(); (void) llvm::createObjCARCAAWrapperPass(); (void) llvm::createObjCARCAPElimPass(); @@ -221,6 +224,11 @@ namespace { (void) llvm::createEliminateAvailableExternallyPass(); (void) 
llvm::createScalarizeMaskedMemIntrinPass(); (void) llvm::createWarnMissedTransformationsPass(); + (void) llvm::createSmallBlockPass(); + (void) llvm::createRedundantSpawnPass(); + (void) llvm::createSpawnRestructurePass(); + (void) llvm::createSyncEliminationPass(); + (void) llvm::createSpawnUnswitchPass(); (void)new llvm::IntervalPartition(); (void)new llvm::ScalarEvolutionWrapperPass(); diff --git a/llvm/include/llvm/Transforms/CSI.h b/llvm/include/llvm/Transforms/CSI.h new file mode 100644 index 00000000000000..a357324d013b3e --- /dev/null +++ b/llvm/include/llvm/Transforms/CSI.h @@ -0,0 +1,610 @@ +//===-- CSI.h ------------------------instrumentation hooks --*- C++ -*----===// +// +// The LLVM Compiler Infrastructure +// +// TODO: License +//===----------------------------------------------------------------------===// +// +// This file is part of CSI, a framework that provides comprehensive static +// instrumentation. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_CSI_H +#define LLVM_TRANSFORMS_CSI_H + +#include "llvm/Transforms/Instrumentation.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Module.h" + +namespace llvm { + +static const char *const CsiRtUnitInitName = "__csirt_unit_init"; +static const char *const CsiRtUnitCtorName = "csirt.unit_ctor"; +static const char *const CsiFunctionBaseIdName = "__csi_unit_func_base_id"; +static const char *const CsiFunctionExitBaseIdName = "__csi_unit_func_exit_base_id"; +static const char *const CsiBasicBlockBaseIdName = "__csi_unit_bb_base_id"; +static const char *const CsiCallsiteBaseIdName = "__csi_unit_callsite_base_id"; +static const char *const CsiLoadBaseIdName = "__csi_unit_load_base_id"; +static const char *const CsiStoreBaseIdName = "__csi_unit_store_base_id"; +static const char *const 
CsiUnitFedTableName = "__csi_unit_fed_table";
+static const char *const CsiFuncIdVariablePrefix = "__csi_func_id_";
+static const char *const CsiUnitFedTableArrayName = "__csi_unit_fed_tables";
+static const char *const CsiInitCallsiteToFunctionName =
+    "__csi_init_callsite_to_function";
+static const char *const CsiDisableInstrumentationName =
+    "__csi_disable_instrumentation";
+
+static const int64_t CsiCallsiteUnknownTargetId = -1;
+// See llvm/tools/clang/lib/CodeGen/CodeGenModule.h:
+static const int CsiUnitCtorPriority = 65535;
+
+/// Maintains a mapping from CSI ID to static data for that ID.
+class ForensicTable {
+public:
+  ForensicTable() : BaseId(nullptr), IdCounter(0) {}
+  ForensicTable(Module &M, StringRef BaseIdName);
+
+  /// The number of entries in this forensic table.
+  uint64_t size() const { return IdCounter; }
+
+  /// Get the local ID of the given Value.
+  uint64_t getId(const Value *V);
+
+  /// The GlobalVariable holding the base ID for this forensic table.
+  GlobalVariable *baseId() const { return BaseId; }
+
+  /// Convert a local ID to a global ID.
+  ///
+  /// This is done by using the given IRBuilder to insert a load of the base-ID
+  /// global variable followed by an add of the base value and the local ID.
+  ///
+  /// \returns A Value holding the global ID corresponding to the
+  /// given local ID.
+  Value *localToGlobalId(uint64_t LocalId, IRBuilder<> &IRB) const;
+
+protected:
+  /// The GlobalVariable holding the base ID for this forensic table.
+  GlobalVariable *BaseId;
+  /// Counter of local IDs used so far.
+  uint64_t IdCounter;
+  /// Map of Value to local ID.
+  DenseMap<const Value *, uint64_t> ValueToLocalIdMap;
+};
+
+/// Maintains a mapping from CSI ID to front-end data for that ID.
+///
+/// The front-end data currently is the source location that a given
+/// CSI ID corresponds to.
+class FrontEndDataTable : public ForensicTable { +public: + FrontEndDataTable() : ForensicTable() {} + FrontEndDataTable(Module &M, StringRef BaseIdName) + : ForensicTable(M, BaseIdName) {} + + /// The number of entries in this FED table + uint64_t size() const { return LocalIdToSourceLocationMap.size(); } + + /// Add the given Function to this FED table. + /// \returns The local ID of the Function. + uint64_t add(const Function &F); + + /// Add the given BasicBlock to this FED table. + /// \returns The local ID of the BasicBlock. + uint64_t add(const BasicBlock &BB); + + /// Add the given Instruction to this FED table. + /// \returns The local ID of the Instruction. + uint64_t add(const Instruction &I); + + /// Get the Type for a pointer to a FED table entry. + /// + /// A FED table entry is just a source location. + static PointerType *getPointerType(LLVMContext &C); + + /// Insert this FED table into the given Module. + /// + /// The FED table is constructed as a ConstantArray indexed by local + /// IDs. The runtime is responsible for performing the mapping that + /// allows the table to be indexed by global ID. + Constant *insertIntoModule(Module &M) const; + +private: + struct SourceLocation { + StringRef Name; + int32_t Line; + int32_t Column; + StringRef Filename; + StringRef Directory; + }; + + /// Map of local ID to SourceLocation. + DenseMap LocalIdToSourceLocationMap; + + /// Create a struct type to match the "struct SourceLocation" type. + /// (and the source_loc_t type in csi.h). + static StructType *getSourceLocStructType(LLVMContext &C); + + /// Append the debug information to the table, assigning it the next + /// available ID. + /// + /// \returns The local ID of the appended information. + /// @{ + void add(uint64_t ID, const DILocation *Loc); + void add(uint64_t ID, const DISubprogram *Subprog); + /// @} + + /// Append the line and file information to the table, assigning it + /// the next available ID. 
+ /// + /// \returns The new local ID of the DILocation. + void add(uint64_t ID, int32_t Line = -1, int32_t Column = -1, + StringRef Filename = "", StringRef Directory = "", + StringRef Name = ""); +}; + +/// Represents a property value passed to hooks. +class CsiProperty { +public: + CsiProperty() {} + + /// Return the coerced type of a property. + /// + /// TODO: Right now, this function simply returns a 64-bit integer. Although + /// this solution works for x86_64, it should be generalized to handle other + /// architectures in the future. + static Type *getCoercedType(LLVMContext &C, StructType *Ty) { + // Must match the definition of property type in csi.h + // return StructType::get(IntegerType::get(C, 64), + // nullptr); + // We return an integer type, rather than a struct type, to deal with x86_64 + // type coercion on struct bit fields. + return IntegerType::get(C, 64); + } + + /// Return a constant value holding this property. + virtual Constant *getValueImpl(LLVMContext &C) const = 0; + + Constant *getValue(IRBuilder<> &IRB) const { + return getValueImpl(IRB.getContext()); + } +}; + +class CsiFuncProperty : public CsiProperty { +public: + CsiFuncProperty() { + PropValue.Bits = 0; + } + + /// Return the Type of a property. + static Type *getType(LLVMContext &C) { + // Must match the definition of property type in csi.h + return CsiProperty::getCoercedType( + C, StructType::get(IntegerType::get(C, PropBits.MaySpawn), + IntegerType::get(C, PropBits.Padding))); + } + /// Return a constant value holding this property. + Constant *getValueImpl(LLVMContext &C) const override { + // Must match the definition of property type in csi.h + // StructType *StructTy = getType(C); + // return ConstantStruct::get(StructTy, + // ConstantInt::get(IntegerType::get(C, 64), 0), + // nullptr); + // TODO: This solution works for x86, but should be generalized to support + // other architectures in the future. 
+ return ConstantInt::get(getType(C), PropValue.Bits); + } + + /// Set the value of the MightDetach property. + void setMaySpawn(bool v) { + PropValue.Fields.MaySpawn = v; + } + +private: + typedef union { + // Must match the definition of property type in csi.h + struct { + unsigned MaySpawn : 1; + uint64_t Padding : 63; + } Fields; + uint64_t Bits; + } Property; + + /// The underlying values of the properties. + Property PropValue; + + typedef struct { + int MaySpawn; + int Padding; + } PropertyBits; + + /// The number of bits representing each property. + static constexpr PropertyBits PropBits = { 1, (64-1) }; +}; + +class CsiFuncExitProperty : public CsiProperty { +public: + CsiFuncExitProperty() { + PropValue.Bits = 0; + } + + /// Return the Type of a property. + static Type *getType(LLVMContext &C) { + // Must match the definition of property type in csi.h + return CsiProperty::getCoercedType( + C, StructType::get(IntegerType::get(C, PropBits.MaySpawn), + IntegerType::get(C, PropBits.Padding))); + } + /// Return a constant value holding this property. + Constant *getValueImpl(LLVMContext &C) const override { + // Must match the definition of property type in csi.h + // StructType *StructTy = getType(C); + // return ConstantStruct::get(StructTy, + // ConstantInt::get(IntegerType::get(C, 64), 0), + // nullptr); + // TODO: This solution works for x86, but should be generalized to support + // other architectures in the future. + return ConstantInt::get(getType(C), PropValue.Bits); + } + + /// Set the value of the MightDetach property. + void setMaySpawn(bool v) { + PropValue.Fields.MaySpawn = v; + } + +private: + typedef union { + // Must match the definition of property type in csi.h + struct { + unsigned MaySpawn : 1; + uint64_t Padding : 63; + } Fields; + uint64_t Bits; + } Property; + + /// The underlying values of the properties. 
+ Property PropValue; + + typedef struct { + int MaySpawn; + int Padding; + } PropertyBits; + + /// The number of bits representing each property. + static constexpr PropertyBits PropBits = { 1, (64-1) }; +}; + +class CsiBBProperty : public CsiProperty { +public: + CsiBBProperty() { + PropValue.Bits = 0; + } + + /// Return the Type of a property. + static Type *getType(LLVMContext &C) { + // Must match the definition of property type in csi.h + return CsiProperty::getCoercedType( + C, StructType::get(IntegerType::get(C, PropBits.IsLandingPad), + IntegerType::get(C, PropBits.IsEHPad), + IntegerType::get(C, PropBits.Padding))); + } + + /// Return a constant value holding this property. + Constant *getValueImpl(LLVMContext &C) const override { + // Must match the definition of property type in csi.h + // StructType *StructTy = getType(C); + // return ConstantStruct::get(StructTy, + // ConstantInt::get(IntegerType::get(C, 64), 0), + // nullptr); + // TODO: This solution works for x86, but should be generalized to support + // other architectures in the future. + return ConstantInt::get(getType(C), PropValue.Bits); + } + + /// Set the value of the IsLandingPad property. + void setIsLandingPad(bool v) { + PropValue.Fields.IsLandingPad = v; + } + + /// Set the value of the IsEHPad property. + void setIsEHPad(bool v) { + PropValue.Fields.IsEHPad = v; + } + +private: + typedef union { + // Must match the definition of property type in csi.h + struct { + unsigned IsLandingPad : 1; + unsigned IsEHPad : 1; + uint64_t Padding : 62; + } Fields; + uint64_t Bits; + } Property; + + /// The underlying values of the properties. + Property PropValue; + + typedef struct { + int IsLandingPad; + int IsEHPad; + int Padding; + } PropertyBits; + + /// The number of bits representing each property. 
+ static constexpr PropertyBits PropBits = { 1, 1, (64-1-1) }; +}; + +class CsiCallProperty : public CsiProperty { +public: + CsiCallProperty() { + PropValue.Bits = 0; + } + + /// Return the Type of a property. + static Type *getType(LLVMContext &C) { + // Must match the definition of property type in csi.h + return CsiProperty::getCoercedType( + C, StructType::get(IntegerType::get(C, PropBits.IsIndirect), + IntegerType::get(C, PropBits.Padding))); + } + /// Return a constant value holding this property. + Constant *getValueImpl(LLVMContext &C) const override { + // Must match the definition of property type in csi.h + // StructType *StructTy = getType(C); + // return ConstantStruct::get( + // StructTy, + // ConstantInt::get(IntegerType::get(C, PropBits.IsIndirect), + // PropValue.IsIndirect), + // ConstantInt::get(IntegerType::get(C, PropBits.Padding), 0), + // nullptr); + // TODO: This solution works for x86, but should be generalized to support + // other architectures in the future. + return ConstantInt::get(getType(C), PropValue.Bits); + } + + /// Set the value of the IsIndirect property. + void setIsIndirect(bool v) { + PropValue.Fields.IsIndirect = v; + } + +private: + typedef union { + // Must match the definition of property type in csi.h + struct { + unsigned IsIndirect : 1; + uint64_t Padding : 63; + } Fields; + uint64_t Bits; + } Property; + + /// The underlying values of the properties. + Property PropValue; + + typedef struct { + int IsIndirect; + int Padding; + } PropertyBits; + + /// The number of bits representing each property. + static constexpr PropertyBits PropBits = { 1, (64-1) }; +}; + +class CsiLoadStoreProperty : public CsiProperty { +public: + CsiLoadStoreProperty() { + PropValue.Bits = 0; + } + /// Return the Type of a property. 
+ static Type *getType(LLVMContext &C) { + // Must match the definition of property type in csi.h + return CsiProperty::getCoercedType( + C, StructType::get(IntegerType::get(C, PropBits.Alignment), + IntegerType::get(C, PropBits.IsVtableAccess), + IntegerType::get(C, PropBits.IsConstant), + IntegerType::get(C, PropBits.IsOnStack), + IntegerType::get(C, PropBits.MayBeCaptured), + IntegerType::get(C, PropBits.LoadReadBeforeWriteInBB), + IntegerType::get(C, PropBits.Padding))); + } + /// Return a constant value holding this property. + Constant *getValueImpl(LLVMContext &C) const override { + // Must match the definition of property type in csi.h + // return ConstantStruct::get( + // StructTy, + // ConstantInt::get(IntegerType::get(C, PropBits.Alignment), + // PropValue.Alignment), + // ConstantInt::get(IntegerType::get(C, PropBits.IsVtableAccess), + // PropValue.IsVtableAccess), + // ConstantInt::get(IntegerType::get(C, PropBits.IsConstant), + // PropValue.IsVtableAccess), + // ConstantInt::get(IntegerType::get(C, PropBits.IsOnStack), + // PropValue.IsVtableAccess), + // ConstantInt::get(IntegerType::get(C, PropBits.MayBeCaptured), + // PropValue.IsVtableAccess), + // ConstantInt::get(IntegerType::get(C, PropBits.LoadReadBeforeWriteInBB), + // PropValue.LoadReadBeforeWriteInBB), + // ConstantInt::get(IntegerType::get(C, PropBits.Padding), 0), + // nullptr); + return ConstantInt::get(getType(C), PropValue.Bits); + } + + /// Set the value of the Alignment property. + void setAlignment(char v) { + PropValue.Fields.Alignment = v; + } + /// Set the value of the IsVtableAccess property. + void setIsVtableAccess(bool v) { + PropValue.Fields.IsVtableAccess = v; + } + /// Set the value of the IsConstant property. + void setIsConstant(bool v) { + PropValue.Fields.IsConstant = v; + } + /// Set the value of the IsOnStack property. + void setIsOnStack(bool v) { + PropValue.Fields.IsOnStack = v; + } + /// Set the value of the MayBeCaptured property. 
+  void setMayBeCaptured(bool v) {
+    PropValue.Fields.MayBeCaptured = v;
+  }
+  /// Set the value of the LoadReadBeforeWriteInBB property.
+  void setLoadReadBeforeWriteInBB(bool v) {
+    PropValue.Fields.LoadReadBeforeWriteInBB = v;
+  }
+
+private:
+  typedef union {
+    // Must match the definition of property type in csi.h
+    struct {
+      unsigned Alignment : 8;
+      unsigned IsVtableAccess : 1;
+      unsigned IsConstant : 1;
+      unsigned IsOnStack : 1;
+      unsigned MayBeCaptured : 1;
+      unsigned LoadReadBeforeWriteInBB : 1;
+      uint64_t Padding : 51;
+    } Fields;
+    uint64_t Bits;
+  } Property;
+
+  /// The underlying values of the properties.
+  Property PropValue;
+
+  typedef struct {
+    int Alignment;
+    int IsVtableAccess;
+    int IsConstant;
+    int IsOnStack;
+    int MayBeCaptured;
+    int LoadReadBeforeWriteInBB;
+    int Padding;
+  } PropertyBits;
+
+  /// The number of bits representing each property.
+  static constexpr PropertyBits PropBits = { 8, 1, 1, 1, 1, 1, (64-8-1-1-1-1-1) };
+};
+
+struct CSIImpl {
+public:
+  CSIImpl(Module &M, CallGraph *CG,
+          const CSIOptions &Options = CSIOptions())
+      : M(M), DL(M.getDataLayout()), CG(CG), Options(Options),
+        CsiFuncEntry(nullptr), CsiFuncExit(nullptr), CsiBBEntry(nullptr),
+        CsiBBExit(nullptr), CsiBeforeCallsite(nullptr),
+        CsiAfterCallsite(nullptr), CsiBeforeRead(nullptr),
+        CsiAfterRead(nullptr), CsiBeforeWrite(nullptr), CsiAfterWrite(nullptr),
+        MemmoveFn(nullptr), MemcpyFn(nullptr), MemsetFn(nullptr),
+        InitCallsiteToFunction(nullptr), RTUnitInit(nullptr)
+  {}
+
+  bool run();
+
+  /// Get the number of bytes accessed via the given address.
+  static int getNumBytesAccessed(Value *Addr, const DataLayout &DL);
+
+  /// Helpers to extract properties of loads/stores.
+  static bool isVtableAccess(Instruction *I);
+  static bool addrPointsToConstantData(Value *Addr);
+  static bool isAtomic(Instruction *I);
+
+protected:
+  /// Initialize the CSI pass.
+  void initializeCsi();
+  /// Finalize the CSI pass.
+ void finalizeCsi(); + + /// Initialize llvm::Functions for the CSI hooks. + /// @{ + void initializeLoadStoreHooks(); + void initializeFuncHooks(); + void initializeBasicBlockHooks(); + void initializeCallsiteHooks(); + void initializeMemIntrinsicsHooks(); + /// @} + + static StructType *getUnitFedTableType(LLVMContext &C, + PointerType *EntryPointerType); + static Constant *fedTableToUnitFedTable(Module &M, + StructType *UnitFedTableType, + FrontEndDataTable &FedTable); + /// Initialize the front-end data table structures. + void initializeFEDTables(); + /// Collect unit front-end data table structures for finalization. + void collectUnitFEDTables(); + + virtual CallInst *createRTUnitInitCall(IRBuilder<> &IRB); + + // Get the local ID of the given function. + uint64_t getLocalFunctionID(Function &F); + /// Generate a function that stores global function IDs into a set + /// of externally-visible global variables. + void generateInitCallsiteToFunction(); + + /// Compute CSI properties on the given ordered list of loads and stores. + void computeLoadAndStoreProperties( + SmallVectorImpl> + &LoadAndStoreProperties, + SmallVectorImpl &BBLoadsAndStores, + const DataLayout &DL); + + /// Insert calls to the instrumentation hooks. + /// @{ + void addLoadStoreInstrumentation(Instruction *I, Function *BeforeFn, + Function *AfterFn, Value *CsiId, + Type *AddrType, Value *Addr, int NumBytes, + CsiLoadStoreProperty &Prop); + void instrumentLoadOrStore(Instruction *I, CsiLoadStoreProperty &Prop, + const DataLayout &DL); + void instrumentAtomic(Instruction *I, const DataLayout &DL); + bool instrumentMemIntrinsic(Instruction *I); + void instrumentCallsite(Instruction *I); + void instrumentBasicBlock(BasicBlock &BB); + void instrumentFunction(Function &F); + /// @} + + /// Insert a conditional call to the given hook function before the + /// given instruction. The condition is based on the value of + /// __csi_disable_instrumentation. 
+ void insertConditionalHookCall(Instruction *I, Function *HookFunction, + ArrayRef HookArgs); + + /// Return true if the given function should not be instrumented. + bool shouldNotInstrumentFunction(Function &F); + + Module &M; + const DataLayout &DL; + CallGraph *CG; + CSIOptions Options; + + FrontEndDataTable FunctionFED, FunctionExitFED, BasicBlockFED, CallsiteFED, + LoadFED, StoreFED; + + SmallVector UnitFedTables; + + // Instrumentation hooks + Function *CsiFuncEntry, *CsiFuncExit; + Function *CsiBBEntry, *CsiBBExit; + Function *CsiBeforeCallsite, *CsiAfterCallsite; + Function *CsiBeforeRead, *CsiAfterRead; + Function *CsiBeforeWrite, *CsiAfterWrite; + + Function *MemmoveFn, *MemcpyFn, *MemsetFn; + Function *InitCallsiteToFunction; + // GlobalVariable *DisableInstrGV; + + // Runtime unit initialization + Function *RTUnitInit; + + Type *IntptrTy; + DenseMap FuncOffsetMap; +}; + +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_CSI_H diff --git a/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h b/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h index 276306f686ffac..34170aff4f44ff 100644 --- a/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h +++ b/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h @@ -113,8 +113,15 @@ class PassManagerBuilder { /// passes at the end of the main CallGraphSCC passes and before any /// function simplification passes run by CGPassManager. EP_CGSCCOptimizerLate, + + /// EP_TapirLate - This extension point allows adding passes just before + /// Tapir instructions are lowered to calls into a parallel runtime system. + EP_TapirLate, }; + /// Whether the Cilk Calls should be instrumented + bool InstrumentCilk; + /// The Optimization Level - Specify the basic optimization level. 
/// 0 = -O0, 1 = -O1, 2 = -O2, 3 = -O3 unsigned OptLevel; @@ -123,6 +130,12 @@ class PassManagerBuilder { /// 0 = none, 1 = -Os, 2 = -Oz unsigned SizeLevel; + /// The Pre-lowering to parallel runtime calls optimization level + /// 0 = -P0 = leave with detach instructions, 1 = no optimizations before conversion, 2 = optimize before conversion + unsigned ParallelLevel; + + bool Rhino; + /// LibraryInfo - Specifies information about the runtime library for the /// optimizer. If this is non-null, it is added to both the function and /// per-module pass pipeline. @@ -189,6 +202,7 @@ class PassManagerBuilder { void addPGOInstrPasses(legacy::PassManagerBase &MPM); void addFunctionSimplificationPasses(legacy::PassManagerBase &MPM); void addInstructionCombiningPass(legacy::PassManagerBase &MPM) const; + void prepopulateModulePassManager(legacy::PassManagerBase &MPM); public: /// populateFunctionPassManager - This fills in the function pass manager, diff --git a/llvm/include/llvm/Transforms/Instrumentation.h b/llvm/include/llvm/Transforms/Instrumentation.h index 017cab0a7750df..78dca4e1ef0ffd 100644 --- a/llvm/include/llvm/Transforms/Instrumentation.h +++ b/llvm/include/llvm/Transforms/Instrumentation.h @@ -203,7 +203,26 @@ struct SanitizerCoverageOptions { ModulePass *createSanitizerCoverageModulePass( const SanitizerCoverageOptions &Options = SanitizerCoverageOptions()); -/// Calculate what to divide by to scale counts. 
+// Insert CilkSanitizer (Cilk determinacy race detection) instrumentation +ModulePass *createCilkSanitizerPass(); + +// Options for comprehensive static instrumentation +struct CSIOptions { + bool InstrumentFuncEntryExit = true; + bool InstrumentBasicBlocks = true; + bool InstrumentMemoryAccesses = true; + bool InstrumentCalls = true; + bool InstrumentAtomics = true; + bool InstrumentMemIntrinsics = true; + + CSIOptions() = default; +}; + +// Insert ComprehensiveStaticInstrumentation instrumentation +ModulePass *createComprehensiveStaticInstrumentationPass( + const CSIOptions &Options = CSIOptions()); + +/// \brief Calculate what to divide by to scale counts. /// /// Given the maximum count, calculate a divisor that will scale all the /// weights to strictly less than std::numeric_limits::max(). diff --git a/llvm/include/llvm/Transforms/Scalar.h b/llvm/include/llvm/Transforms/Scalar.h index 8fcf9296ba47c6..1808ba38ae7e8a 100644 --- a/llvm/include/llvm/Transforms/Scalar.h +++ b/llvm/include/llvm/Transforms/Scalar.h @@ -451,6 +451,12 @@ FunctionPass *createNaryReassociatePass(); // FunctionPass *createLoopDistributePass(); +//===----------------------------------------------------------------------===// +// +// LoopFuse - Fuse loops. +// +FunctionPass *createLoopFusePass(); + //===----------------------------------------------------------------------===// // // LoopLoadElimination - Perform loop-aware load elimination. diff --git a/llvm/include/llvm/Transforms/Scalar/LoopFuse.h b/llvm/include/llvm/Transforms/Scalar/LoopFuse.h new file mode 100644 index 00000000000000..5b7011e3b432a5 --- /dev/null +++ b/llvm/include/llvm/Transforms/Scalar/LoopFuse.h @@ -0,0 +1,130 @@ +//===------------- LoopFuse.h - Loop Fusion Utility -----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// Fuse two adjacent loops to improve cache locality. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/EquivalenceClasses.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Transforms/Utils/LoopVersioning.h" +#include + +namespace llvm { +/// \brief The pass class. +class LoopFuse : public FunctionPass { + +public: + // Kind of fusion made. + enum Kind { + NO_FUSION = 0, // Fusion was not made even to check dependence legality. + // This is when loops had failed basic structure checks. + REVERTED_FUSION, // Fusion was reverted due to failed dependence legality. + PURE_FUSION, // Fusion succeeded with removal of original loops. + VERSIONED_FUSION // Fusion succeeded with versioning due to runtime checks. + }; + +private: + // Analyses used. + LoopInfo *LI; + LoopAccessLegacyAnalysis *LAA; + DominatorTree *DT; + ScalarEvolution *SE; + + // FusionSwitcher - Branch instruction that controls switching between + // original and fused versions. This gets initialized to true when loops are + // multiversioned to check fusion legality. By default, it points to original + // version. + BranchInst *FusionSwitcher; + + Loop *FusedLoop; + + // LAI for FusedLoop. + const LoopAccessInfo *LAI; + + // Kind of fusion that happened. + Kind FusionKind = NO_FUSION; + + // CustomVMap: VMap of BBs for fused loop. 
The problem with having a
+  // ValueToValueMapTy passed in by a client is that it gets updated when the
+  // loops are removed after a successful fusion, and this is undesirable. Also,
+  // a ValueToValueMapTy is used when both Values are present. So only a
+  // plain llvm::Value* is maintained as the map's value, in contrast with
+  // ValueToValueMapTy's WeakVH. Clients can use this mapping as a VMap.
+  typedef std::map<Value *, Value *> CustomVMap;
+  CustomVMap VMap;
+
+  // Rewrite IncomingBlocks in PHIs of @Br's successor blocks from Br's parent
+  // to @To.
+  void RewritePHI(BranchInst *Br, BasicBlock *To);
+
+  // Fuse loops @L1 and @L2 and return the fused loop.
+  Loop *FuseLoops(Loop &L1, Loop &L2);
+
+  // Legality and profitability checks.
+  bool DependenceLegal(Loop &L1, Loop &L2);
+  bool DefsUsedAcrossLoops(Loop &L1, Loop &L2);
+  bool IsLegalAndProfitable(Loop &L1, Loop &L2);
+
+  // Removal routines based on fusion success.
+  void RemoveLoopCompletelyWithPreheader(Loop &L);
+  void RemoveFusionSwitcher(Loop &L);
+
+  // Outside use updates.
+  void UpdateUsesOutsideLoop(Loop &L);
+  void AddPHIsOutsideLoop(Loop &L, BasicBlock *OrigIncomingBlock);
+
+public:
+  LoopFuse() : FunctionPass(ID) {
+    initializeLoopFusePass(*PassRegistry::getPassRegistry());
+  }
+
+  // Initialization interface when this pass is used as a utility.
+  LoopFuse(LoopInfo *_LI, LoopAccessLegacyAnalysis *_LAA, DominatorTree *_DT,
+           ScalarEvolution *_SE)
+      : FunctionPass(ID), LI(_LI), LAA(_LAA), DT(_DT), SE(_SE) {}
+
+  Loop *getFusedLoop() { return FusedLoop; }
+
+  const CustomVMap &getVMap() { return VMap; }
+
+  unsigned getFusionKind() { return FusionKind; }
+
+  // Interface when this pass is used as a utility.
+  bool run(Loop &L1, Loop &L2);
+
+  bool runOnFunction(Function &F) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.addRequired<LoopAccessLegacyAnalysis>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<ScalarEvolutionWrapperPass>();
+
+    AU.addPreserved<LoopInfoWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+    AU.addPreserved<ScalarEvolutionWrapperPass>();
+  }
+
+  static char ID;
+};
+} // namespace llvm
diff --git a/llvm/include/llvm/Transforms/Scalar/SROA.h b/llvm/include/llvm/Transforms/Scalar/SROA.h
index b36c6f492be12a..fcd43fad841f27 100644
--- a/llvm/include/llvm/Transforms/Scalar/SROA.h
+++ b/llvm/include/llvm/Transforms/Scalar/SROA.h
@@ -64,6 +64,7 @@ class SROALegacyPass;
 /// this form. By doing so, it will enable promotion of vector aggregates to
 /// SSA vector values.
 class SROA : public PassInfoMixin<SROA> {
+  bool FunctionContainsDetach = false;
   LLVMContext *C = nullptr;
   DominatorTree *DT = nullptr;
   AssumptionCache *AC = nullptr;
diff --git a/llvm/include/llvm/Transforms/Tapir.h b/llvm/include/llvm/Transforms/Tapir.h
new file mode 100644
index 00000000000000..96626c283bf40c
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Tapir.h
@@ -0,0 +1,68 @@
+//===-- Tapir.h - Tapir Transformations -------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This header file defines prototypes for accessor functions that expose passes
+// in the Tapir transformations library.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_TAPIR_H
+#define LLVM_TRANSFORMS_TAPIR_H
+
+namespace llvm {
+class Pass;
+class ModulePass;
+class FunctionPass;
+
+//===----------------------------------------------------------------------===//
+//
+// LoopSpawning - Create a loop spawning pass.
+//
+Pass *createLoopSpawningPass();
+
+//===----------------------------------------------------------------------===//
+//
+// SmallBlock - Create the SmallBlock pass.
+//
+FunctionPass *createSmallBlockPass();
+
+//===----------------------------------------------------------------------===//
+//
+// SyncElimination - TODO
+//
+FunctionPass *createSyncEliminationPass();
+
+//===----------------------------------------------------------------------===//
+//
+// RedundantSpawn - Create the RedundantSpawn pass.
+//
+FunctionPass *createRedundantSpawnPass();
+
+//===----------------------------------------------------------------------===//
+//
+// SpawnRestructure - Create the SpawnRestructure pass.
+//
+FunctionPass *createSpawnRestructurePass();
+
+//===----------------------------------------------------------------------===//
+//
+// SpawnUnswitch - Create the SpawnUnswitch pass.
+//
+FunctionPass *createSpawnUnswitchPass();
+
+//===----------------------------------------------------------------------===//
+//
+// PromoteDetachToCilk - Lower Tapir instructions to Cilk runtime calls.
+//
+ModulePass *createLowerTapirToCilkPass(bool DisablePostOpts = false,
+                                       bool Instrument = false);
+
+} // End llvm namespace
+
+#endif
diff --git a/llvm/include/llvm/Transforms/Tapir/CilkABI.h b/llvm/include/llvm/Transforms/Tapir/CilkABI.h
new file mode 100644
index 00000000000000..6c6bd7f4b21f51
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Tapir/CilkABI.h
@@ -0,0 +1,368 @@
+//===- CilkABI.h - Interface to the Intel Cilk Plus runtime ----*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the interface to the Intel Cilk Plus runtime that is
+// used when lowering Tapir instructions to Cilk runtime calls.
+// +//===----------------------------------------------------------------------===// +#ifndef CILK_ABI_H_ +#define CILK_ABI_H_ + +#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/TypeBuilder.h" +#include "llvm/IR/ValueSymbolTable.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include + +extern llvm::cl::opt fastCilk; + +namespace { + +typedef void *__CILK_JUMP_BUFFER[5]; + +struct __cilkrts_pedigree {}; +struct __cilkrts_stack_frame {}; +struct __cilkrts_worker {}; +struct global_state_t {}; + +enum { + __CILKRTS_ABI_VERSION = 1 +}; + +enum { + CILK_FRAME_STOLEN = 0x01, + CILK_FRAME_UNSYNCHED = 0x02, + CILK_FRAME_DETACHED = 0x04, + CILK_FRAME_EXCEPTION_PROBED = 0x08, + CILK_FRAME_EXCEPTING = 0x10, + CILK_FRAME_LAST = 0x80, + CILK_FRAME_EXITING = 0x0100, + CILK_FRAME_SUSPENDED = 0x8000, + CILK_FRAME_UNWINDING = 0x10000 +}; + +#define CILK_FRAME_VERSION (__CILKRTS_ABI_VERSION << 24) +#define CILK_FRAME_VERSION_MASK 0xFF000000 +#define CILK_FRAME_FLAGS_MASK 0x00FFFFFF +#define CILK_FRAME_VERSION_VALUE(_flags) (((_flags) & CILK_FRAME_VERSION_MASK) >> 24) +#define CILK_FRAME_MBZ (~ (CILK_FRAME_STOLEN | \ + CILK_FRAME_UNSYNCHED | \ + CILK_FRAME_DETACHED | \ + CILK_FRAME_EXCEPTION_PROBED | \ + CILK_FRAME_EXCEPTING | \ + CILK_FRAME_LAST | \ + CILK_FRAME_EXITING | \ + CILK_FRAME_SUSPENDED | \ + CILK_FRAME_UNWINDING | \ + CILK_FRAME_VERSION_MASK)) 
+ + +typedef uint32_t cilk32_t; +typedef uint64_t cilk64_t; +typedef void (*__cilk_abi_f32_t)(void *data, cilk32_t low, cilk32_t high); +typedef void (*__cilk_abi_f64_t)(void *data, cilk64_t low, cilk64_t high); + +typedef void (__cilkrts_init)(); + +typedef void (__cilkrts_enter_frame_1)(__cilkrts_stack_frame *sf); +typedef void (__cilkrts_enter_frame_fast_1)(__cilkrts_stack_frame *sf); +typedef void (__cilkrts_leave_frame)(__cilkrts_stack_frame *sf); +typedef void (__cilkrts_rethrow)(__cilkrts_stack_frame *sf); +typedef void (__cilkrts_sync)(__cilkrts_stack_frame *sf); +typedef void (__cilkrts_detach)(__cilkrts_stack_frame *sf); +typedef void (__cilkrts_pop_frame)(__cilkrts_stack_frame *sf); +typedef int (__cilkrts_get_nworkers)(); +typedef __cilkrts_worker *(__cilkrts_get_tls_worker)(); +typedef __cilkrts_worker *(__cilkrts_get_tls_worker_fast)(); +typedef __cilkrts_worker *(__cilkrts_bind_thread_1)(); + +typedef void (cilk_func)(__cilkrts_stack_frame *); + +typedef void (cilk_enter_begin)(uint32_t, __cilkrts_stack_frame *, void *, void *); +typedef void (cilk_enter_helper_begin)(__cilkrts_stack_frame *, void *, void *); +typedef void (cilk_enter_end)(__cilkrts_stack_frame *, void *); +typedef void (cilk_detach_begin)(__cilkrts_stack_frame *); +typedef void (cilk_detach_end)(); +typedef void (cilk_spawn_prepare)(__cilkrts_stack_frame *); +typedef void (cilk_spawn_or_continue)(int); +typedef void (cilk_sync_begin)(__cilkrts_stack_frame *); +typedef void (cilk_sync_end)(__cilkrts_stack_frame *); +typedef void (cilk_leave_begin)(__cilkrts_stack_frame *); +typedef void (cilk_leave_end)(); +typedef void (__cilkrts_cilk_for_32)(__cilk_abi_f32_t body, void *data, + cilk32_t count, int grain); +typedef void (__cilkrts_cilk_for_64)(__cilk_abi_f64_t body, void *data, + cilk64_t count, int grain); + +#define CILKRTS_FUNC(name, CGF) Get__cilkrts_##name(CGF) + +#define DEFAULT_GET_CILKRTS_FUNC(name) \ + static llvm::Function *Get__cilkrts_##name(llvm::Module& M) { \ + return 
llvm::cast<llvm::Function>(M.getOrInsertFunction( \
+        "__cilkrts_"#name, \
+        llvm::TypeBuilder<__cilkrts_##name, false>::get(M.getContext()) \
+      )); \
+  }
+
+//DEFAULT_GET_CILKRTS_FUNC(get_nworkers)
+#pragma GCC diagnostic ignored "-Wunused-function"
+static llvm::Function *Get__cilkrts_get_nworkers(llvm::Module& M) {
+  llvm::LLVMContext &C = M.getContext();
+  llvm::AttributeList AL;
+  AL = AL.addAttribute(C, llvm::AttributeList::FunctionIndex,
+                       llvm::Attribute::ReadNone);
+  // AL = AL.addAttribute(C, llvm::AttributeSet::FunctionIndex,
+  //                      llvm::Attribute::InaccessibleMemOnly);
+  AL = AL.addAttribute(C, llvm::AttributeList::FunctionIndex,
+                       llvm::Attribute::NoUnwind);
+  llvm::Function *F = llvm::cast<llvm::Function>(
+      M.getOrInsertFunction(
+          "__cilkrts_get_nworkers",
+          llvm::TypeBuilder<__cilkrts_get_nworkers, false>::get(C),
+          AL));
+  return F;
+}
+
+// TODO: set up these CILKRTS and CILK_CSI functions in a cleaner
+// way so we don't need these pragmas.
+#pragma GCC diagnostic ignored "-Wunused-function"
+DEFAULT_GET_CILKRTS_FUNC(init)
+#pragma GCC diagnostic ignored "-Wunused-function"
+DEFAULT_GET_CILKRTS_FUNC(sync)
+#pragma GCC diagnostic ignored "-Wunused-function"
+DEFAULT_GET_CILKRTS_FUNC(rethrow)
+#pragma GCC diagnostic ignored "-Wunused-function"
+DEFAULT_GET_CILKRTS_FUNC(leave_frame)
+#pragma GCC diagnostic ignored "-Wunused-function"
+DEFAULT_GET_CILKRTS_FUNC(get_tls_worker)
+#pragma GCC diagnostic ignored "-Wunused-function"
+DEFAULT_GET_CILKRTS_FUNC(get_tls_worker_fast)
+#pragma GCC diagnostic ignored "-Wunused-function"
+DEFAULT_GET_CILKRTS_FUNC(bind_thread_1)
+
+#pragma GCC diagnostic ignored "-Wunused-function"
+DEFAULT_GET_CILKRTS_FUNC(cilk_for_32)
+#pragma GCC diagnostic ignored "-Wunused-function"
+DEFAULT_GET_CILKRTS_FUNC(cilk_for_64)
+
+#define CILK_CSI_FUNC(name, CGF) Get_cilk_##name(CGF)
+
+#define GET_CILK_CSI_FUNC(name) \
+  static llvm::Function *Get_cilk_##name(llvm::Module& M) { \
+    return llvm::cast<llvm::Function>(M.getOrInsertFunction( \
+        "cilk_"#name, \
+
llvm::TypeBuilder<cilk_##name, false>::get(M.getContext()) \
+      )); \
+  }
+
+#define GET_CILK_CSI_FUNC2(name) \
+  static llvm::Function *Get_cilk_##name(llvm::Module& M) { \
+    return llvm::cast<llvm::Function>(M.getOrInsertFunction( \
+        "cilk_"#name, \
+        llvm::TypeBuilder<cilk_##name, false>::get(M.getContext()) \
+      )); \
+  }
+
+#pragma GCC diagnostic ignored "-Wunused-function"
+GET_CILK_CSI_FUNC(enter_begin)
+#pragma GCC diagnostic ignored "-Wunused-function"
+GET_CILK_CSI_FUNC(enter_helper_begin)
+#pragma GCC diagnostic ignored "-Wunused-function"
+GET_CILK_CSI_FUNC(enter_end)
+#pragma GCC diagnostic ignored "-Wunused-function"
+GET_CILK_CSI_FUNC(detach_begin)
+#pragma GCC diagnostic ignored "-Wunused-function"
+GET_CILK_CSI_FUNC(detach_end)
+#pragma GCC diagnostic ignored "-Wunused-function"
+GET_CILK_CSI_FUNC2(spawn_prepare)
+#pragma GCC diagnostic ignored "-Wunused-function"
+GET_CILK_CSI_FUNC2(spawn_or_continue)
+#pragma GCC diagnostic ignored "-Wunused-function"
+GET_CILK_CSI_FUNC(sync_begin)
+#pragma GCC diagnostic ignored "-Wunused-function"
+GET_CILK_CSI_FUNC(sync_end)
+#pragma GCC diagnostic ignored "-Wunused-function"
+GET_CILK_CSI_FUNC(leave_begin)
+#pragma GCC diagnostic ignored "-Wunused-function"
+GET_CILK_CSI_FUNC(leave_end)
+
+typedef std::map<llvm::LLVMContext *, llvm::StructType *> TypeBuilderCache;
+
+} // namespace
+
+namespace llvm {
+
+/// Specializations of llvm::TypeBuilder for:
+///   __cilkrts_pedigree,
+///   __cilkrts_worker,
+///   __cilkrts_stack_frame
+template <bool X>
+class TypeBuilder<__cilkrts_pedigree, X> {
+public:
+  static StructType *get(LLVMContext &C) {
+    static TypeBuilderCache cache;
+    TypeBuilderCache::iterator I = cache.find(&C);
+    if (I != cache.end())
+      return I->second;
+    StructType *ExistingTy = StructType::getOrCreate(C, "struct.__cilkrts_pedigree");
+    cache[&C] = ExistingTy;
+    StructType *NewTy = StructType::create(C);
+    NewTy->setBody(
+        TypeBuilder<uint64_t, X>::get(C), // rank
+        TypeBuilder<__cilkrts_pedigree*, X>::get(C) // next
+    );
+    if (ExistingTy->isOpaque())
+      ExistingTy->setBody(NewTy->elements());
+    else
+
      assert(ExistingTy->isLayoutIdentical(NewTy) &&
+             "Conflicting definition of type struct.__cilkrts_pedigree");
+    return ExistingTy;
+  }
+  enum {
+    rank,
+    next
+  };
+};
+
+template <bool X>
+class TypeBuilder<__cilkrts_worker, X> {
+public:
+  static StructType *get(LLVMContext &C) {
+    static TypeBuilderCache cache;
+    TypeBuilderCache::iterator I = cache.find(&C);
+    if (I != cache.end())
+      return I->second;
+    // Try looking up this type by name.
+    StructType *Ty = StructType::getOrCreate(C, "struct.__cilkrts_worker");
+    assert(Ty->isOpaque() &&
+           "Conflicting definition of type struct.__cilkrts_worker.");
+    cache[&C] = Ty;
+    Ty->setBody(
+        TypeBuilder<__cilkrts_stack_frame**, X>::get(C), // tail
+        TypeBuilder<__cilkrts_stack_frame**, X>::get(C), // head
+        TypeBuilder<__cilkrts_stack_frame**, X>::get(C), // exc
+        TypeBuilder<__cilkrts_stack_frame**, X>::get(C), // protected_tail
+        TypeBuilder<__cilkrts_stack_frame**, X>::get(C), // ltq_limit
+        TypeBuilder<int32_t, X>::get(C), // self
+        TypeBuilder<void*, X>::get(C), // g
+        TypeBuilder<void*, X>::get(C), // l
+        TypeBuilder<void*, X>::get(C), // reducer_map
+        TypeBuilder<__cilkrts_stack_frame*, X>::get(C), // current_stack_frame
+        TypeBuilder<__cilkrts_stack_frame**, X>::get(C), // saved_protected_tail
+        TypeBuilder<void*, X>::get(C), // sysdep
+        TypeBuilder<__cilkrts_pedigree, X>::get(C) // pedigree
+    );
+    return Ty;
+  }
+  enum {
+    tail,
+    head,
+    exc,
+    protected_tail,
+    ltq_limit,
+    self,
+    g,
+    l,
+    reducer_map,
+    current_stack_frame,
+    saved_protected_tail,
+    sysdep,
+    pedigree
+  };
+};
+
+template <bool X>
+class TypeBuilder<__cilkrts_stack_frame, X> {
+public:
+  static StructType *get(LLVMContext &C) {
+    static TypeBuilderCache cache;
+    TypeBuilderCache::iterator I = cache.find(&C);
+    if (I != cache.end())
+      return I->second;
+    StructType *Ty = StructType::create(C, "struct.__cilkrts_stack_frame");
+    cache[&C] = Ty;
+    Ty->setBody(
+        TypeBuilder<uint32_t, X>::get(C), // flags
+        TypeBuilder<uint32_t, X>::get(C), // size
+        TypeBuilder<__cilkrts_stack_frame*, X>::get(C), // call_parent
+
TypeBuilder<__cilkrts_worker*, X>::get(C), // worker
+        TypeBuilder<void*, X>::get(C), // except_data
+        TypeBuilder<__CILK_JUMP_BUFFER, X>::get(C), // ctx
+        TypeBuilder<uint32_t, X>::get(C), // mxcsr
+        TypeBuilder<uint16_t, X>::get(C), // fpcsr
+        TypeBuilder<uint16_t, X>::get(C), // reserved
+        TypeBuilder<__cilkrts_pedigree, X>::get(C) // parent_pedigree
+    );
+    return Ty;
+  }
+  enum {
+    flags,
+    size,
+    call_parent,
+    worker,
+    except_data,
+    ctx,
+    mxcsr,
+    fpcsr,
+    reserved,
+    parent_pedigree
+  };
+};
+
+} // namespace llvm
+
+
+////////////////////////////////////////////////////////////////////////////////
+
+namespace llvm {
+namespace cilk {
+
+Value *GetOrCreateWorker8(Function &F);
+void createSync(SyncInst &inst, ValueToValueMapTy &DetachCtxToStackFrame,
+                bool instrument = false);
+
+bool verifyDetachedCFG(const DetachInst &Detach, DominatorTree &DT,
+                       bool error = true);
+
+bool populateDetachedCFG(const DetachInst &Detach, DominatorTree &DT,
+                         SmallPtrSetImpl<BasicBlock *> &functionPieces,
+                         SmallVectorImpl<BasicBlock *> &reattachB,
+                         SmallPtrSetImpl<BasicBlock *> &ExitBlocks,
+                         bool replace, bool error = true);
+
+Function *extractDetachBodyToFunction(DetachInst &Detach,
+                                      DominatorTree &DT, AssumptionCache &AC,
+                                      CallInst **call = nullptr);
+
+Function *createDetach(DetachInst &Detach,
+                       ValueToValueMapTy &DetachCtxToStackFrame,
+                       DominatorTree &DT, AssumptionCache &AC,
+                       bool instrument = false);
+
+} // end of cilk namespace
+} // end of llvm namespace
+
+#endif
diff --git a/llvm/include/llvm/Transforms/Tapir/LoopSpawning.h b/llvm/include/llvm/Transforms/Tapir/LoopSpawning.h
new file mode 100644
index 00000000000000..df6718c99418c1
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Tapir/LoopSpawning.h
@@ -0,0 +1,37 @@
+//===---- LoopSpawning.h ----------------------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass modifies Tapir loops to spawn their iterations efficiently.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_TAPIR_LOOPSPAWNING_H
+#define LLVM_TRANSFORMS_TAPIR_LOOPSPAWNING_H
+
+#include "llvm/ADT/MapVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+/// The LoopSpawning Pass.
+struct LoopSpawningPass : public PassInfoMixin<LoopSpawningPass> {
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+}
+
+#endif // LLVM_TRANSFORMS_TAPIR_LOOPSPAWNING_H
diff --git a/llvm/include/llvm/Transforms/Tapir/Outline.h b/llvm/include/llvm/Transforms/Tapir/Outline.h
new file mode 100644
index 00000000000000..a11ef83007556d
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Tapir/Outline.h
@@ -0,0 +1,88 @@
+//===- llvm/Transforms/Tapir/Outline.h - Outlining for Tapir -*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines helper functions for outlining portions of code containing
+// Tapir instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_TAPIR_OUTLINE_H
+#define LLVM_TRANSFORMS_TAPIR_OUTLINE_H
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+
+namespace llvm {
+
+typedef SetVector<Value *> ValueSet;
+
+/// Find the inputs and outputs for a function outlined from the given set of
+/// basic blocks.
+void findInputsOutputs(const SmallPtrSetImpl<BasicBlock *> &Blocks,
+                       ValueSet &Inputs,
+                       ValueSet &Outputs,
+                       const SmallPtrSetImpl<BasicBlock *> *ExitBlocks =
+                       nullptr);
+
+/// Clone Blocks into NewFunc, transforming the old arguments into references to
+/// VMap values.
+///
+/// TODO: Fix the std::vector part of the type of this function.
+void CloneIntoFunction(Function *NewFunc, const Function *OldFunc,
+                       std::vector<BasicBlock *> Blocks,
+                       ValueToValueMapTy &VMap,
+                       bool ModuleLevelChanges,
+                       SmallVectorImpl<ReturnInst *> &Returns,
+                       const StringRef NameSuffix,
+                       SmallPtrSetImpl<BasicBlock *> *ExitBlocks = nullptr,
+                       DISubprogram *SP = nullptr,
+                       ClonedCodeInfo *CodeInfo = nullptr,
+                       ValueMapTypeRemapper *TypeMapper = nullptr,
+                       ValueMaterializer *Materializer = nullptr);
+
+/// Create a helper function whose signature is based on Inputs and
+/// Outputs as follows: f(in0, ..., inN, out0, ..., outN)
+///
+/// TODO: Fix the std::vector part of the type of this function.
+Function *CreateHelper(const ValueSet &Inputs, + const ValueSet &Outputs, + std::vector Blocks, + BasicBlock *Header, + const BasicBlock *OldEntry, + const BasicBlock *OldExit, + ValueToValueMapTy &VMap, + Module *DestM, + bool ModuleLevelChanges, + SmallVectorImpl &Returns, + const StringRef NameSuffix, + SmallPtrSetImpl *ExitBlocks = nullptr, + const Instruction *InputSyncRegion = nullptr, + ClonedCodeInfo *CodeInfo = nullptr, + ValueMapTypeRemapper *TypeMapper = nullptr, + ValueMaterializer *Materializer = nullptr); + +// Add alignment assumptions to parameters of outlined function, based on known +// alignment data in the caller. +void AddAlignmentAssumptions(const Function *Caller, + const ValueSet &Inputs, + ValueToValueMapTy &VMap, + const Instruction *CallSite, + AssumptionCache *AC, + DominatorTree *DT); + +} // End llvm namespace + +#endif diff --git a/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h b/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h index 5b16a2c0d0b1a3..4bc6bdc3378a27 100644 --- a/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h +++ b/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h @@ -97,6 +97,7 @@ struct CriticalEdgeSplittingOptions { bool MergeIdenticalEdges = false; bool DontDeleteUselessPHIs = false; bool PreserveLCSSA = false; + bool SplitDetachContinue = false; CriticalEdgeSplittingOptions(DominatorTree *DT = nullptr, LoopInfo *LI = nullptr, @@ -117,6 +118,11 @@ struct CriticalEdgeSplittingOptions { PreserveLCSSA = true; return *this; } + + CriticalEdgeSplittingOptions &setSplitDetachContinue() { + SplitDetachContinue = true; + return *this; + } }; /// If this edge is a critical edge, insert a new node to split the critical diff --git a/llvm/include/llvm/Transforms/Utils/ModuleUtils.h b/llvm/include/llvm/Transforms/Utils/ModuleUtils.h index fee492be2a9023..5e33ba151fc592 100644 --- a/llvm/include/llvm/Transforms/Utils/ModuleUtils.h +++ b/llvm/include/llvm/Transforms/Utils/ModuleUtils.h @@ -40,6 +40,13 @@ void 
appendToGlobalCtors(Module &M, Function *F, int Priority, void appendToGlobalDtors(Module &M, Function *F, int Priority, Constant *Data = nullptr); +// Validate the result of Module::getOrInsertFunction called for an +// interface function of ComprehensiveStaticInstrumentation. If the +// instrumented module defines a function with the same name, their +// prototypes must match, otherwise getOrInsertFunction returns a +// bitcast. +Function *checkCsiInterfaceFunction(Constant *FuncOrBitcast); + // Validate the result of Module::getOrInsertFunction called for an interface // function of given sanitizer. If the instrumented module defines a function // with the same name, their prototypes must match, otherwise diff --git a/llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h b/llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h index 5ddfbe2bf05881..5342bd1c418123 100644 --- a/llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h +++ b/llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h @@ -30,6 +30,7 @@ class AssumptionCache; /// ever one layer of bitcasts or GEPs between the alloca and the lifetime /// markers. bool isAllocaPromotable(const AllocaInst *AI); +bool isAllocaParallelPromotable(const AllocaInst *AI, DominatorTree &DT); /// Promote the specified list of alloca instructions into scalar /// registers, inserting PHI nodes as appropriate. diff --git a/llvm/include/llvm/Transforms/Utils/SSAUpdater.h b/llvm/include/llvm/Transforms/Utils/SSAUpdater.h index d02607acbbb579..355422e0e4b46f 100644 --- a/llvm/include/llvm/Transforms/Utils/SSAUpdater.h +++ b/llvm/include/llvm/Transforms/Utils/SSAUpdater.h @@ -54,6 +54,9 @@ class SSAUpdater { /// the vector. SmallVectorImpl *InsertedPHIs; + /// This keeps track of which values are defined in detached blocks. + void *VID = nullptr; + public: /// If InsertedPHIs is specified, it will be filled /// in with all PHI Nodes created by rewriting. 
@@ -106,6 +109,8 @@ class SSAUpdater { /// merge the appropriate values, and this value isn't live out of the block. Value *GetValueInMiddleOfBlock(BasicBlock *BB); + bool GetValueIsDetachedInBlock(BasicBlock *BB); + /// Rewrite a use of the symbolic value. /// /// This handles PHI nodes, which use their value in the corresponding diff --git a/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h b/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h index cab0f3e7157578..2b2d7a168ae729 100644 --- a/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h +++ b/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h @@ -66,6 +66,9 @@ class SSAUpdaterImpl { // Marker for existing PHIs that match. PhiT *PHITag = nullptr; + // Flag to indicate that the AvailableVal would be used after a Reattach. + bool DetachedUse = false; + BBInfo(BlkT *ThisBB, ValT V) : BB(ThisBB), AvailableVal(V), DefBB(V ? this : nullptr) {} }; @@ -76,6 +79,10 @@ class SSAUpdaterImpl { SmallVectorImpl *InsertedPHIs; + using ValIsDetachedTy = DenseMap; + + ValIsDetachedTy *ValIsDetached; + using BlockListTy = SmallVectorImpl; using BBMapTy = DenseMap; @@ -84,8 +91,9 @@ class SSAUpdaterImpl { public: explicit SSAUpdaterImpl(UpdaterT *U, AvailableValsTy *A, - SmallVectorImpl *Ins) : - Updater(U), AvailableVals(A), InsertedPHIs(Ins) {} + SmallVectorImpl *Ins, + ValIsDetachedTy *D = nullptr) : + Updater(U), AvailableVals(A), InsertedPHIs(Ins), ValIsDetached(D) {} /// GetValue - Check to see if AvailableVals has an entry for the specified /// BB and if so, return it. If not, construct SSA form by first @@ -350,6 +358,10 @@ class SSAUpdaterImpl { (*AvailableVals)[Info->BB] = PHI; } + // Set of blocks with detached values that would be used except + // for Reattach. + SmallVector DetachedValBlocks; + // Now go back through the worklist in reverse order to fill in the // arguments for any new PHIs added in the forward traversal. 
for (typename BlockListTy::reverse_iterator I = BlockList->rbegin(), @@ -368,14 +380,34 @@ class SSAUpdaterImpl { if (!PHI) continue; + // TODO: Change this so we do not assume that a block has at + // most one Detach and Reattach predecessor. + BBInfo *DetachPredInfo = nullptr; + BBInfo *ReattachPredInfo = nullptr; // Iterate through the block's predecessors. for (unsigned p = 0; p != Info->NumPreds; ++p) { BBInfo *PredInfo = Info->Preds[p]; BlkT *Pred = PredInfo->BB; + if (Traits::BlockReattaches(Pred, Updater)) { + ReattachPredInfo = PredInfo; + continue; + } // Skip to the nearest preceding definition. if (PredInfo->DefBB != PredInfo) PredInfo = PredInfo->DefBB; Traits::AddPHIOperand(PHI, PredInfo->AvailableVal, Pred); + if (Traits::BlockDetaches(Pred, Updater)) + DetachPredInfo = PredInfo; + } + if (ReattachPredInfo) { + assert(DetachPredInfo && + "Reattach predecessor found with no corresponding Detach predecessor."); + // Available value from predecessor through a reattach is the + // same as that for the corresponding detach. + Traits::AddPHIOperand(PHI, DetachPredInfo->AvailableVal, + ReattachPredInfo->BB); + if (DetachPredInfo->AvailableVal != ReattachPredInfo->AvailableVal) + DetachedValBlocks.push_back(Info); } LLVM_DEBUG(dbgs() << " Inserted PHI: " << *PHI << "\n"); @@ -383,6 +415,9 @@ class SSAUpdaterImpl { // If the client wants to know about all new instructions, tell it. if (InsertedPHIs) InsertedPHIs->push_back(PHI); } + + // Mark any definitions that are detached from their use. 
+ MarkDetachedDefs(&DetachedValBlocks); } /// FindExistingPHI - Look through the PHI nodes in a block to see if any of @@ -416,7 +451,21 @@ class SSAUpdaterImpl { for (typename Traits::PHI_iterator I = Traits::PHI_begin(PHI), E = Traits::PHI_end(PHI); I != E; ++I) { ValT IncomingVal = I.getIncomingValue(); - BBInfo *PredInfo = BBMap[I.getIncomingBlock()]; + BlkT *BB = I.getIncomingBlock(); + + // Replace a reattach predecessor with the corresponding + // detach predecessor. + // + // TODO: Remove the implicit assumption here that each basic + // block has at most one reattach predecessor. + if (Traits::BlockReattaches(BB, Updater)) + for (typename Traits::PHI_iterator PI = Traits::PHI_begin(PHI), + PE = Traits::PHI_end(PHI); PI != PE; ++PI) + if (Traits::BlockDetaches(PI.getIncomingBlock(), Updater)) { + BB = PI.getIncomingBlock(); + break; + } + BBInfo *PredInfo = BBMap[BB]; // Skip to the nearest preceding definition. if (PredInfo->DefBB != PredInfo) PredInfo = PredInfo->DefBB; @@ -459,6 +508,30 @@ class SSAUpdaterImpl { BBMap[BB]->AvailableVal = PHIVal; } } + + /// MarkDetachedDefs - Mark all definitions that reach the basic + /// blocks in WorkList as having detached uses. 
+  void MarkDetachedDefs(SmallVectorImpl<BBInfo *> *WorkList) {
+    BBInfo *Info;
+    while (!WorkList->empty()) {
+      Info = WorkList->pop_back_val();
+      Info->DetachedUse = true;
+
+      ValT AvailableVal = Info->AvailableVal;
+      if (!AvailableVal)
+        continue;
+
+      if (ValIsDetached)
+        (*ValIsDetached)[Info->BB] = true;
+
+      if (Traits::ValueIsPHI(AvailableVal, Updater) ||
+          Info->DefBB != Info)
+        for (unsigned p = 0; p != Info->NumPreds; ++p)
+          if (!Info->Preds[p]->DetachedUse)
+            WorkList->push_back(Info->Preds[p]);
+    }
+  }
+
 };
 } // end namespace llvm
diff --git a/llvm/include/llvm/Transforms/Utils/TapirUtils.h b/llvm/include/llvm/Transforms/Utils/TapirUtils.h
new file mode 100644
index 00000000000000..f8e4e98850c237
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Utils/TapirUtils.h
@@ -0,0 +1,53 @@
+//===-- TapirUtils.h - Utility methods for Tapir ---------------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines utility methods for handling code containing Tapir
+// instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_UTILS_TAPIRUTILS_H
+#define LLVM_TRANSFORMS_UTILS_TAPIRUTILS_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+
+namespace llvm {
+
+class BasicBlock;
+class DetachInst;
+class DominatorTree;
+class TerminatorInst;
+
+/// Move static allocas in a block into the specified entry block. Leave
+/// lifetime markers behind for those static allocas. Returns true if the
+/// cloned block still contains dynamic allocas, which cannot be moved.
+bool MoveStaticAllocasInBlock(
+    BasicBlock *Entry, BasicBlock *Block,
+    SmallVectorImpl<Instruction *> &ExitPoints);
+
+/// Serialize the sub-CFG detached by the specified detach
+/// instruction.
Removes the detach instruction and returns a pointer +/// to the branch instruction that replaces it. +BranchInst* SerializeDetachedCFG(DetachInst *DI, DominatorTree *DT = nullptr); + +/// Get the entry basic block to the detached context that contains +/// the specified block. +const BasicBlock *GetDetachedCtx(const BasicBlock *BB); +BasicBlock *GetDetachedCtx(BasicBlock *BB); + +/// isCriticalContinueEdge - Return true if the specified edge is a critical +/// detach-continue edge. Critical detach-continue edges are critical edges - +/// from a block with multiple successors to a block with multiple predecessors +/// - even after ignoring all reattach edges. +bool isCriticalContinueEdge(const TerminatorInst *TI, unsigned SuccNum); + +} // End llvm namespace + +#endif diff --git a/llvm/lib/Analysis/AliasAnalysis.cpp b/llvm/lib/Analysis/AliasAnalysis.cpp index 3446aef399381f..9d1efdfeddc68e 100644 --- a/llvm/lib/Analysis/AliasAnalysis.cpp +++ b/llvm/lib/Analysis/AliasAnalysis.cpp @@ -24,6 +24,8 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/CFLAndersAliasAnalysis.h" @@ -139,6 +141,42 @@ ModRefInfo AAResults::getModRefInfo(Instruction *I, const CallBase *Call2) { } else if (I->isFenceLike()) { // If this is a fence, just return ModRef. return ModRefInfo::ModRef; + } else if (auto D = dyn_cast(I)) { + ModRefInfo Result = ModRefInfo::NoModRef; + SmallPtrSet Visited; + SmallVector WorkList; + WorkList.push_back(D->getDetached()); + while (!WorkList.empty()) { + BasicBlock *BB = WorkList.pop_back_val(); + if (!Visited.insert(BB).second) + continue; + + // for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { + for (Instruction &DI : *BB) { + // Fail fast if we encounter an invalid CFG. 
+ assert(!(D == &DI) && + "Detached CFG reaches its own Detach instruction."); + + // Ignore sync instructions in this analysis + if (isa(DI) || isa(DI)) + continue; + + if (isa(DI) || isa(DI) || + isa(DI) || isa(DI) || + DI.isFenceLike() || ImmutableCallSite(&DI)) + Result = ModRefInfo(Result | getModRefInfo(&DI, Call)); + if (&DI == Call.getInstruction()) + return ModRefInfo::NoModRef; + } + + // Add successors + const TerminatorInst *T = BB->getTerminator(); + if (!isa(T) || + T->getSuccessor(0) != D->getContinue()) + for (unsigned idx = 0, max = T->getNumSuccessors(); idx < max; ++idx) + WorkList.push_back(T->getSuccessor(idx)); + } + return Result; } else { // Otherwise, check if the call modifies or references the // location this memory access defines. The best we can say @@ -540,7 +578,90 @@ ModRefInfo AAResults::getModRefInfo(const AtomicRMWInst *RMW, return ModRefInfo::ModRef; } -/// Return information about whether a particular call site modifies +ModRefInfo AAResults::getModRefInfo(const DetachInst *D, + const MemoryLocation &Loc) { + ModRefInfo Result = MRI_NoModRef; + SmallPtrSet Visited; + SmallVector WorkList; + WorkList.push_back(D->getSuccessor(0)); + while (!WorkList.empty()) { + const BasicBlock *BB = WorkList.pop_back_val(); + if (!Visited.insert(BB).second) + continue; + + for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I != E; ++I) { + // Ignore sync instructions in this analysis + if (isa(I)) + continue; + + // Fail fast if we encounter an invalid CFG. + assert(!(D == &*I) && + "Invalid CFG found: Detached CFG reaches its own Detach instruction."); + + if (!Loc.Ptr) + Result = ModRefInfo(Result | getModRefInfo(&*I)); + else + Result = ModRefInfo(Result | getModRefInfo(&*I, Loc)); + + // Early-exit the moment we reach the top of the lattice. 
+ if (Result == MRI_ModRef) + return Result; + } + + // Add successors + const TerminatorInst *T = BB->getTerminator(); + if (!isa(T) || + T->getSuccessor(0) != D->getSuccessor(1)) + for (unsigned idx = 0, max = T->getNumSuccessors(); idx < max; ++idx) + WorkList.push_back(T->getSuccessor(idx)); + } + + return Result; +} + +ModRefInfo AAResults::getModRefInfo(const SyncInst *S, + const MemoryLocation &Loc) { + ModRefInfo Result = MRI_NoModRef; + SmallPtrSet Visited; + SmallVector WorkList; + WorkList.push_back(S->getParent()); + while(!WorkList.empty()) { + const BasicBlock *BB = WorkList.pop_back_val(); + if (!Visited.insert(BB).second) + continue; + + const TerminatorInst *T = BB->getTerminator(); + if (isa(T)) { + Result = ModRefInfo(Result | getModRefInfo(T, Loc)); + + // Early-exit the moment we reach the top of the lattice. + if (Result == MRI_ModRef) + return Result; + } + + // Add predecessors + for (const_pred_iterator PI = pred_begin(BB), E = pred_end(BB); + PI != E; ++PI) { + const BasicBlock *Pred = *PI; + const TerminatorInst *PT = Pred->getTerminator(); + // Ignore reattached predecessors and predecessors that end in + // syncs, because this sync does not wait on those predecessors. + if (isa(PT) || isa(PT)) + continue; + // If this block is detached, ignore the predecessor that + // detaches it. + if (const DetachInst *Det = dyn_cast(PT)) + if (Det->getDetached() == BB) + continue; + + WorkList.push_back(Pred); + } + } + + return Result; +} + +/// \brief Return information about whether a particular call site modifies /// or reads the specified memory location \p MemLoc before instruction \p I /// in a BasicBlock. An ordered basic block \p OBB can be used to speed up /// instruction-ordering queries inside the BasicBlock containing \p I. 
diff --git a/llvm/lib/Analysis/Analysis.cpp b/llvm/lib/Analysis/Analysis.cpp index bb8742123a0f08..be402b1990f75b 100644 --- a/llvm/lib/Analysis/Analysis.cpp +++ b/llvm/lib/Analysis/Analysis.cpp @@ -85,6 +85,8 @@ void llvm::initializeAnalysis(PassRegistry &Registry) { initializeLCSSAVerificationPassPass(Registry); initializeMemorySSAWrapperPassPass(Registry); initializeMemorySSAPrinterLegacyPassPass(Registry); + initializeDetachSSAWrapperPassPass(Registry); + initializeDetachSSAPrinterLegacyPassPass(Registry); } void LLVMInitializeAnalysis(LLVMPassRegistryRef R) { diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt index c57d8ef69d69b7..1742260bb24e52 100644 --- a/llvm/lib/Analysis/CMakeLists.txt +++ b/llvm/lib/Analysis/CMakeLists.txt @@ -25,6 +25,7 @@ add_llvm_library(LLVMAnalysis Delinearization.cpp DemandedBits.cpp DependenceAnalysis.cpp + DetachSSA.cpp DivergenceAnalysis.cpp DomPrinter.cpp DominanceFrontier.cpp diff --git a/llvm/lib/Analysis/DetachSSA.cpp b/llvm/lib/Analysis/DetachSSA.cpp new file mode 100644 index 00000000000000..545280e5c3e930 --- /dev/null +++ b/llvm/lib/Analysis/DetachSSA.cpp @@ -0,0 +1,1082 @@ +//===-- DetachSSA.cpp - Detach SSA Builder---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------===// +// +// This file implements the DetachSSA class. 
+// +//===----------------------------------------------------------------===// +#include "llvm/Analysis/DetachSSA.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/GraphTraits.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/Analysis/IteratedDominanceFrontier.h" +#include "llvm/IR/AssemblyAnnotationWriter.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/FormattedStream.h" + +#define DEBUG_TYPE "detachssa" +using namespace llvm; +INITIALIZE_PASS_BEGIN(DetachSSAWrapperPass, "detachssa", "Detach SSA", false, + true) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(DetachSSAWrapperPass, "detachssa", "Detach SSA", false, + true) + +INITIALIZE_PASS_BEGIN(DetachSSAPrinterLegacyPass, "print-detachssa", + "Detach SSA Printer", false, false) +INITIALIZE_PASS_DEPENDENCY(DetachSSAWrapperPass) +INITIALIZE_PASS_END(DetachSSAPrinterLegacyPass, "print-detachssa", + "Detach SSA Printer", false, false) + +static cl::opt + VerifyDetachSSA("verify-detachssa", cl::init(false), cl::Hidden, + cl::desc("Verify DetachSSA in legacy printer pass.")); + +namespace llvm { +/// \brief An assembly annotator class to print Detach SSA information in +/// comments. 
+class DetachSSAAnnotatedWriter : public AssemblyAnnotationWriter { + friend class DetachSSA; + const DetachSSA *DSSA; + +public: + DetachSSAAnnotatedWriter(const DetachSSA *D) : DSSA(D) {} + + virtual void emitBasicBlockStartAnnot(const BasicBlock *BB, + formatted_raw_ostream &OS) { + if (DetachAccess *DA = DSSA->getDetachAccess(BB)) + OS << "; " << *DA << "\n"; + } + + virtual void emitInstructionAnnot(const Instruction *I, + formatted_raw_ostream &OS) { + if (DetachAccess *DA = DSSA->getDetachAccess(I)) + OS << "; " << *DA << "\n"; + } +}; + +struct RenamePassData { + DomTreeNode *DTN; + DomTreeNode::const_iterator ChildIt; + DetachAccess *IncomingVal; + + RenamePassData(DomTreeNode *D, DomTreeNode::const_iterator It, + DetachAccess *M) + : DTN(D), ChildIt(It), IncomingVal(M) {} + void swap(RenamePassData &RHS) { + std::swap(DTN, RHS.DTN); + std::swap(ChildIt, RHS.ChildIt); + std::swap(IncomingVal, RHS.IncomingVal); + } +}; +} // anonymous namespace + +namespace llvm { + +void DetachSSA::renameSuccessorPhis(BasicBlock *BB, DetachAccess *IncomingVal, + bool RenameAllUses) { + // Pass through values to our successors + for (const BasicBlock *S : successors(BB)) { + auto It = PerBlockAccesses.find(S); + // Rename the phi nodes in our successor block + if (It == PerBlockAccesses.end() || !isa(It->second->front())) + continue; + AccessList *Accesses = It->second.get(); + auto *Phi = cast(&Accesses->front()); + if (RenameAllUses) { + int PhiIndex = Phi->getBasicBlockIndex(BB); + assert(PhiIndex != -1 && "Incomplete phi during partial rename"); + Phi->setIncomingValue(PhiIndex, IncomingVal); + } else + Phi->addIncoming(IncomingVal, BB); + } +} + +/// \brief Rename a single basic block into DetachSSA form. +/// Uses the standard SSA renaming algorithm. +/// \returns The new incoming value. 
+DetachAccess *DetachSSA::renameBlock(BasicBlock *BB, DetachAccess *IncomingVal, + bool RenameAllUses) { + auto It = PerBlockAccesses.find(BB); + // Skip most processing if the list is empty. + if (It != PerBlockAccesses.end()) { + AccessList *Accesses = It->second.get(); + for (DetachAccess &L : *Accesses) { + if (DetachUseOrDef *DUD = dyn_cast(&L)) { + if (DUD->getDefiningAccess() == nullptr || RenameAllUses) + DUD->setDefiningAccess(IncomingVal); + if (isa(&L)) + IncomingVal = &L; + } else { + IncomingVal = &L; + } + } + } + return IncomingVal; +} + +/// \brief This is the standard SSA renaming algorithm. +/// +/// We walk the dominator tree in preorder, renaming accesses, and then filling +/// in phi nodes in our successors. +void DetachSSA::renamePass(DomTreeNode *Root, DetachAccess *IncomingVal, + SmallPtrSetImpl &Visited, + bool SkipVisited, bool RenameAllUses) { + SmallVector WorkStack; + // Skip everything if we already renamed this block and we are skipping. + // Note: You can't sink this into the if, because we need it to occur + // regardless of whether we skip blocks or not. + bool AlreadyVisited = !Visited.insert(Root->getBlock()).second; + if (SkipVisited && AlreadyVisited) + return; + + IncomingVal = renameBlock(Root->getBlock(), IncomingVal, RenameAllUses); + renameSuccessorPhis(Root->getBlock(), IncomingVal, RenameAllUses); + WorkStack.push_back({Root, Root->begin(), IncomingVal}); + + while (!WorkStack.empty()) { + DomTreeNode *Node = WorkStack.back().DTN; + DomTreeNode::const_iterator ChildIt = WorkStack.back().ChildIt; + IncomingVal = WorkStack.back().IncomingVal; + + if (ChildIt == Node->end()) { + WorkStack.pop_back(); + } else { + DomTreeNode *Child = *ChildIt; + ++WorkStack.back().ChildIt; + BasicBlock *BB = Child->getBlock(); + // Note: You can't sink this into the if, because we need it to occur + // regardless of whether we skip blocks or not. 
+      AlreadyVisited = !Visited.insert(BB).second;
+      if (SkipVisited && AlreadyVisited) {
+        // We already visited this during our renaming, which can happen when
+        // being asked to rename multiple blocks. Figure out the incoming val,
+        // which is the last def.
+        // Incoming value can only change if there is a block def, and in that
+        // case, it's the last block def in the list.
+        if (auto *BlockDefs = getWritableBlockDefs(BB))
+          IncomingVal = &*BlockDefs->rbegin();
+      } else
+        IncomingVal = renameBlock(BB, IncomingVal, RenameAllUses);
+      renameSuccessorPhis(BB, IncomingVal, RenameAllUses);
+      WorkStack.push_back({Child, Child->begin(), IncomingVal});
+    }
+  }
+}
+
+/// \brief This handles unreachable block accesses by deleting phi nodes in
+/// unreachable blocks, and marking all other unreachable DetachAccess's as
+/// being uses of the live on entry definition.
+void DetachSSA::markUnreachableAsLiveOnEntry(BasicBlock *BB) {
+  assert(!DT->isReachableFromEntry(BB) &&
+         "Reachable block found while handling unreachable blocks");
+
+  // Make sure phi nodes in our reachable successors end up with a
+  // LiveOnEntryDef for our incoming edge, even though our block is forward
+  // unreachable. We could just disconnect these blocks from the CFG fully,
+  // but we do not right now.
+  for (const BasicBlock *S : successors(BB)) {
+    if (!DT->isReachableFromEntry(S))
+      continue;
+    auto It = PerBlockAccesses.find(S);
+    // Rename the phi nodes in our successor block.
+    if (It == PerBlockAccesses.end() || !isa<DetachPhi>(It->second->front()))
+      continue;
+    AccessList *Accesses = It->second.get();
+    auto *Phi = cast<DetachPhi>(&Accesses->front());
+    Phi->addIncoming(LiveOnEntryDef.get(), BB);
+  }
+
+  auto It = PerBlockAccesses.find(BB);
+  if (It == PerBlockAccesses.end())
+    return;
+
+  auto &Accesses = It->second;
+  for (auto AI = Accesses->begin(), AE = Accesses->end(); AI != AE;) {
+    auto Next = std::next(AI);
+    // If we have a phi, just remove it. We are going to replace all
+    // users with live on entry.
+    if (auto *UseOrDef = dyn_cast<DetachUseOrDef>(AI))
+      UseOrDef->setDefiningAccess(LiveOnEntryDef.get());
+    else
+      Accesses->erase(AI);
+    AI = Next;
+  }
+}
+
+DetachSSA::DetachSSA(Function &Func, DominatorTree *DT)
+    : DT(DT), F(Func), NextID(INVALID_DETACHACCESS_ID) {
+  buildDetachSSA();
+}
+
+DetachSSA::~DetachSSA() {
+  // Drop all our references.
+  for (const auto &Pair : PerBlockAccesses)
+    for (DetachAccess &DA : *Pair.second)
+      DA.dropAllReferences();
+}
+
+DetachSSA::AccessList *DetachSSA::getOrCreateAccessList(const BasicBlock *BB) {
+  auto Res = PerBlockAccesses.insert(std::make_pair(BB, nullptr));
+
+  if (Res.second)
+    Res.first->second = make_unique<AccessList>();
+  return Res.first->second.get();
+}
+
+DetachSSA::DefsList *DetachSSA::getOrCreateDefsList(const BasicBlock *BB) {
+  auto Res = PerBlockDefs.insert(std::make_pair(BB, nullptr));
+
+  if (Res.second)
+    Res.first->second = make_unique<DefsList>();
+  return Res.first->second.get();
+}
+
+// /// This class is a batch walker of all DetachUse's in the program, and points
+// /// their defining access at the thing that actually clobbers them. Because it
+// /// is a batch walker that touches everything, it does not operate like the
+// /// other walkers. This walker is basically performing a top-down SSA renaming
+// /// pass, where the version stack is used as the cache. This enables it to be
+// /// significantly more time and memory efficient than using the regular walker,
+// /// which is walking bottom-up.
+// class DetachSSA::OptimizeUses {
+// public:
+//   OptimizeUses(DetachSSA *DSSA, DetachSSAWalker *Walker, AliasAnalysis *AA,
+//                DominatorTree *DT)
+//       : DSSA(DSSA), Walker(Walker), AA(AA), DT(DT) {
+//     Walker = DSSA->getWalker();
+//   }
+
+//   void optimizeUses();
+
+// private:
+//   /// This represents where a given detach location is in the stack.
+//   struct MemlocStackInfo {
+//     // This essentially is keeping track of versions of the stack. Whenever
+//     // the stack changes due to pushes or pops, these versions increase.
+//     unsigned long StackEpoch;
+//     unsigned long PopEpoch;
+//     // This is the lower bound of places on the stack to check. It is equal to
+//     // the place the last stack walk ended.
+//     // Note: Correctness depends on this being initialized to 0, which densemap
+//     // does.
+//     unsigned long LowerBound;
+//     const BasicBlock *LowerBoundBlock;
+//     // This is where the last walk for this detach location ended.
+//     unsigned long LastKill;
+//     bool LastKillValid;
+//   };
+//   void optimizeUsesInBlock(const BasicBlock *, unsigned long &, unsigned long &,
+//                            SmallVectorImpl<DetachAccess *> &,
+//                            DenseMap<DetachLocOrCall, MemlocStackInfo> &);
+//   DetachSSA *DSSA;
+//   DetachSSAWalker *Walker;
+//   AliasAnalysis *AA;
+//   DominatorTree *DT;
+// };
+
+// /// Optimize the uses in a given block. This is basically the SSA renaming
+// /// algorithm, with one caveat: We are able to use a single stack for all
+// /// DetachUses. This is because the set of *possible* reaching DetachDefs is
+// /// the same for every DetachUse. The *actual* clobbering DetachDef is just
+// /// going to be some position in that stack of possible ones.
+// ///
+// /// We track the stack positions that each DetachLocation needs
+// /// to check, and last ended at. This is because we only want to check the
+// /// things that changed since last time. The same DetachLocation should
+// /// get clobbered by the same store (getModRefInfo does not use invariantness or
+// /// things like this, and if they start, we can modify DetachLocOrCall to
+// /// include relevant data).
+// void DetachSSA::OptimizeUses::optimizeUsesInBlock(
+//     const BasicBlock *BB, unsigned long &StackEpoch, unsigned long &PopEpoch,
+//     SmallVectorImpl<DetachAccess *> &VersionStack,
+//     DenseMap<DetachLocOrCall, MemlocStackInfo> &LocStackInfo) {
+
+//   /// If no accesses, nothing to do.
+//   DetachSSA::AccessList *Accesses = DSSA->getWritableBlockAccesses(BB);
+//   if (Accesses == nullptr)
+//     return;
+
+//   // Pop everything that doesn't dominate the current block off the stack,
+//   // increment the PopEpoch to account for this.
+//   while (true) {
+//     assert(
+//         !VersionStack.empty() &&
+//         "Version stack should have liveOnEntry sentinel dominating everything");
+//     BasicBlock *BackBlock = VersionStack.back()->getBlock();
+//     if (DT->dominates(BackBlock, BB))
+//       break;
+//     while (VersionStack.back()->getBlock() == BackBlock)
+//       VersionStack.pop_back();
+//     ++PopEpoch;
+//   }
+
+//   for (DetachAccess &DA : *Accesses) {
+//     auto *MU = dyn_cast<DetachUse>(&DA);
+//     if (!MU) {
+//       VersionStack.push_back(&DA);
+//       ++StackEpoch;
+//       continue;
+//     }
+
+//     if (isUseTriviallyOptimizableToLiveOnEntry(*AA, MU->getDetachInst())) {
+//       MU->setDefiningAccess(DSSA->getLiveOnEntryDef(), true);
+//       continue;
+//     }
+
+//     DetachLocOrCall UseMLOC(MU);
+//     auto &LocInfo = LocStackInfo[UseMLOC];
+//     // If the pop epoch changed, it means we've removed stuff from top of
+//     // stack due to changing blocks. We may have to reset the lower bound or
+//     // last kill info.
+//     if (LocInfo.PopEpoch != PopEpoch) {
+//       LocInfo.PopEpoch = PopEpoch;
+//       LocInfo.StackEpoch = StackEpoch;
+//       // If the lower bound was in something that no longer dominates us, we
+//       // have to reset it.
+//       // We can't simply track stack size, because the stack may have had
+//       // pushes/pops in the meantime.
+//       // XXX: This is non-optimal, but is only slower in cases with heavily
+//       // branching dominator trees. To get the optimal number of queries would
+//       // be to make lowerbound and lastkill a per-loc stack, and pop it until
+//       // the top of that stack dominates us. This does not seem worth it ATM.
+//       // A much cheaper optimization would be to always explore the deepest
+//       // branch of the dominator tree first. This will guarantee this resets on
+//       // the smallest set of blocks.
+//       if (LocInfo.LowerBoundBlock && LocInfo.LowerBoundBlock != BB &&
+//           !DT->dominates(LocInfo.LowerBoundBlock, BB)) {
+//         // Reset the lower bound of things to check.
+//         // TODO: Some day we should be able to reset to last kill, rather than
+//         // 0.
+//         LocInfo.LowerBound = 0;
+//         LocInfo.LowerBoundBlock = VersionStack[0]->getBlock();
+//         LocInfo.LastKillValid = false;
+//       }
+//     } else if (LocInfo.StackEpoch != StackEpoch) {
+//       // If all that has changed is the StackEpoch, we only have to check the
+//       // new things on the stack, because we've checked everything before. In
+//       // this case, the lower bound of things to check remains the same.
+//       LocInfo.PopEpoch = PopEpoch;
+//       LocInfo.StackEpoch = StackEpoch;
+//     }
+//     if (!LocInfo.LastKillValid) {
+//       LocInfo.LastKill = VersionStack.size() - 1;
+//       LocInfo.LastKillValid = true;
+//     }
+
+//     // At this point, we should have corrected last kill and LowerBound to be
+//     // in bounds.
+//     assert(LocInfo.LowerBound < VersionStack.size() &&
+//            "Lower bound out of range");
+//     assert(LocInfo.LastKill < VersionStack.size() &&
+//            "Last kill info out of range");
+//     // In any case, the new upper bound is the top of the stack.
+//     unsigned long UpperBound = VersionStack.size() - 1;
+
+//     if (UpperBound - LocInfo.LowerBound > MaxCheckLimit) {
+//       DEBUG(dbgs() << "DetachSSA skipping optimization of " << *MU << " ("
+//                    << *(MU->getDetachInst()) << ")"
+//                    << " because there are " << UpperBound - LocInfo.LowerBound
+//                    << " stores to disambiguate\n");
+//       // Because we did not walk, LastKill is no longer valid, as this may
+//       // have been a kill.
+//       LocInfo.LastKillValid = false;
+//       continue;
+//     }
+//     bool FoundClobberResult = false;
+//     while (UpperBound > LocInfo.LowerBound) {
+//       if (isa<DetachPhi>(VersionStack[UpperBound])) {
+//         // For phis, use the walker, see where we ended up, go there.
+//         Instruction *UseInst = MU->getDetachInst();
+//         DetachAccess *Result = Walker->getClobberingDetachAccess(UseInst);
+//         // We are guaranteed to find it or something is wrong.
+//         while (VersionStack[UpperBound] != Result) {
+//           assert(UpperBound != 0);
+//           --UpperBound;
+//         }
+//         FoundClobberResult = true;
+//         break;
+//       }
+
+//       DetachDef *MD = cast<DetachDef>(VersionStack[UpperBound]);
+//       // If the lifetime of the pointer ends at this instruction, it's live on
+//       // entry.
+//       if (!UseMLOC.IsCall && lifetimeEndsAt(MD, UseMLOC.getLoc(), *AA)) {
+//         // Reset UpperBound to liveOnEntryDef's place in the stack.
+//         UpperBound = 0;
+//         FoundClobberResult = true;
+//         break;
+//       }
+//       if (instructionClobbersQuery(MD, MU, UseMLOC, *AA)) {
+//         FoundClobberResult = true;
+//         break;
+//       }
+//       --UpperBound;
+//     }
+//     // At the end of this loop, UpperBound is either a clobber, or lower bound.
+//     // PHI walking may cause it to be < LowerBound, and in fact, < LastKill.
+//     if (FoundClobberResult || UpperBound < LocInfo.LastKill) {
+//       MU->setDefiningAccess(VersionStack[UpperBound], true);
+//       // We were last killed now by where we got to.
+//       LocInfo.LastKill = UpperBound;
+//     } else {
+//       // Otherwise, we checked all the new ones, and now we know we can get to
+//       // LastKill.
+//       MU->setDefiningAccess(VersionStack[LocInfo.LastKill], true);
+//     }
+//     LocInfo.LowerBound = VersionStack.size() - 1;
+//     LocInfo.LowerBoundBlock = BB;
+//   }
+// }
+
+// /// Optimize uses to point to their actual clobbering definitions.
+// void DetachSSA::OptimizeUses::optimizeUses() {
+//   SmallVector<DetachAccess *, 16> VersionStack;
+//   DenseMap<DetachLocOrCall, MemlocStackInfo> LocStackInfo;
+//   VersionStack.push_back(DSSA->getLiveOnEntryDef());
+
+//   unsigned long StackEpoch = 1;
+//   unsigned long PopEpoch = 1;
+//   // We perform a non-recursive top-down dominator tree walk.
+//   for (const auto *DomNode : depth_first(DT->getRootNode()))
+//     optimizeUsesInBlock(DomNode->getBlock(), StackEpoch, PopEpoch, VersionStack,
+//                         LocStackInfo);
+// }
+
+void DetachSSA::placePHINodes(
+    const SmallPtrSetImpl<BasicBlock *> &DefiningBlocks,
+    const DenseMap<const BasicBlock *, unsigned int> &BBNumbers) {
+  // Determine where our DetachPhi's should go.
+  ForwardIDFCalculator IDFs(*DT);
+  IDFs.setDefiningBlocks(DefiningBlocks);
+  SmallVector<BasicBlock *, 32> IDFBlocks;
+  IDFs.calculate(IDFBlocks);
+
+  std::sort(IDFBlocks.begin(), IDFBlocks.end(),
+            [&BBNumbers](const BasicBlock *A, const BasicBlock *B) {
+              return BBNumbers.lookup(A) < BBNumbers.lookup(B);
+            });
+
+  // Now place DetachPhi nodes.
+  for (auto &BB : IDFBlocks)
+    createDetachPhi(BB);
+}
+
+void DetachSSA::buildDetachSSA() {
+  BasicBlock &StartingPoint = F.getEntryBlock();
+  LiveOnEntryDef = make_unique<DetachDef>(F.getContext(), nullptr, nullptr,
+                                          &StartingPoint, NextID++);
+  DenseMap<const BasicBlock *, unsigned int> BBNumbers;
+  unsigned NextBBNum = 0;
+
+  // We maintain lists of detach accesses per block, trading memory for time. We
+  // could just look up the detach access for every possible instruction in the
+  // stream.
+  SmallPtrSet<BasicBlock *, 32> DefiningBlocks;
+  // Go through each block, figure out where defs occur, and chain together all
+  // the accesses.
+  for (BasicBlock &B : F) {
+    BBNumbers[&B] = NextBBNum++;
+    bool InsertIntoDef = false;
+    AccessList *Accesses = nullptr;
+    DefsList *Defs = nullptr;
+    if (isa<DetachInst>(B.getTerminator()) ||
+        isa<ReattachInst>(B.getTerminator())) {
+      DetachUseOrDef *DUD = new DetachDef(B.getContext(), nullptr,
+                                          B.getTerminator(), &B, NextID++);
+      ValueToDetachAccess[B.getTerminator()] = DUD;
+
+      if (!Accesses)
+        Accesses = getOrCreateAccessList(&B);
+      Accesses->push_back(DUD);
+      InsertIntoDef = true;
+      if (!Defs)
+        Defs = getOrCreateDefsList(&B);
+      Defs->push_back(*DUD);
+    }
+    if (InsertIntoDef)
+      DefiningBlocks.insert(&B);
+  }
+  placePHINodes(DefiningBlocks, BBNumbers);
+
+  // Now do regular SSA renaming on the DetachDef/DetachUse. Visited will get
+  // filled in with all blocks.
+  SmallPtrSet<BasicBlock *, 16> Visited;
+  renamePass(DT->getRootNode(), LiveOnEntryDef.get(), Visited);
+
+  // CachingWalker *Walker = getWalkerImpl();
+
+  // // We're doing a batch of updates; don't drop useful caches between them.
+  // Walker->setAutoResetWalker(false);
+  // OptimizeUses(this, Walker, AA, DT).optimizeUses();
+  // Walker->setAutoResetWalker(true);
+  // Walker->resetClobberWalker();
+
+  // Mark the uses in unreachable blocks as live on entry, so that they go
+  // somewhere.
+  for (auto &BB : F)
+    if (!Visited.count(&BB))
+      markUnreachableAsLiveOnEntry(&BB);
+}
+
+// This is a helper function used by the creation routines. It places NewAccess
+// into the access and defs lists for a given basic block, at the given
+// insertion point.
+void DetachSSA::insertIntoListsForBlock(DetachAccess *NewAccess,
+                                        const BasicBlock *BB,
+                                        InsertionPlace Point) {
+  auto *Accesses = getOrCreateAccessList(BB);
+  if (Point == Beginning) {
+    // If it's a phi node, it goes first, otherwise, it goes after any phi
+    // nodes.
+    if (isa<DetachPhi>(NewAccess)) {
+      Accesses->push_front(NewAccess);
+      auto *Defs = getOrCreateDefsList(BB);
+      Defs->push_front(*NewAccess);
+    } else {
+      auto AI = find_if_not(
+          *Accesses, [](const DetachAccess &DA) { return isa<DetachPhi>(DA); });
+      Accesses->insert(AI, NewAccess);
+      if (!isa<DetachUse>(NewAccess)) {
+        auto *Defs = getOrCreateDefsList(BB);
+        auto DI = find_if_not(
+            *Defs, [](const DetachAccess &DA) { return isa<DetachPhi>(DA); });
+        Defs->insert(DI, *NewAccess);
+      }
+    }
+  } else {
+    Accesses->push_back(NewAccess);
+    if (!isa<DetachUse>(NewAccess)) {
+      auto *Defs = getOrCreateDefsList(BB);
+      Defs->push_back(*NewAccess);
+    }
+  }
+  BlockNumberingValid.erase(BB);
+}
+
+void DetachSSA::insertIntoListsBefore(DetachAccess *What, const BasicBlock *BB,
+                                      AccessList::iterator InsertPt) {
+  auto *Accesses = getWritableBlockAccesses(BB);
+  bool WasEnd = InsertPt == Accesses->end();
+  Accesses->insert(AccessList::iterator(InsertPt), What);
+  if (!isa<DetachUse>(What)) {
+    auto *Defs = getOrCreateDefsList(BB);
+    // If we got asked to insert at the end, we have an easy job, just shove it
+    // at the end. If we got asked to insert before an existing def, we also get
+    // an iterator. If we got asked to insert before a use, we have to hunt for
+    // the next def.
+    if (WasEnd) {
+      Defs->push_back(*What);
+    } else if (isa<DetachDef>(InsertPt)) {
+      Defs->insert(InsertPt->getDefsIterator(), *What);
+    } else {
+      while (InsertPt != Accesses->end() && !isa<DetachDef>(InsertPt))
+        ++InsertPt;
+      // Either we found a def, or we are inserting at the end.
+      if (InsertPt == Accesses->end())
+        Defs->push_back(*What);
+      else
+        Defs->insert(InsertPt->getDefsIterator(), *What);
+    }
+  }
+  BlockNumberingValid.erase(BB);
+}
+
+// Move What before Where in the IR. The end result is that What will belong to
+// the right lists and have the right Block set, but will not otherwise be
+// correct. It will not have the right defining access, and if it is a def,
+// things below it will not properly be updated.
+void DetachSSA::moveTo(DetachUseOrDef *What, BasicBlock *BB,
+                       AccessList::iterator Where) {
+  // Keep it in the lookup tables, remove from the lists.
+  removeFromLists(What, false);
+  What->setBlock(BB);
+  insertIntoListsBefore(What, BB, Where);
+}
+
+void DetachSSA::moveTo(DetachUseOrDef *What, BasicBlock *BB,
+                       InsertionPlace Point) {
+  removeFromLists(What, false);
+  What->setBlock(BB);
+  insertIntoListsForBlock(What, BB, Point);
+}
+
+DetachPhi *DetachSSA::createDetachPhi(BasicBlock *BB) {
+  assert(!getDetachAccess(BB) && "DetachPhi already exists for this BB");
+  DetachPhi *Phi = new DetachPhi(BB->getContext(), BB, NextID++);
+  // Phi's always are placed at the front of the block.
+  insertIntoListsForBlock(Phi, BB, Beginning);
+  ValueToDetachAccess[BB] = Phi;
+  return Phi;
+}
+
+// DetachUseOrDef *DetachSSA::createDefinedAccess(Instruction *I,
+//                                                DetachAccess *Definition) {
+//   assert(!isa<PHINode>(I) && "Cannot create a defined access for a PHI");
+//   DetachUseOrDef *NewAccess = createNewAccess(I);
+//   assert(
+//       NewAccess != nullptr &&
+//       "Tried to create a detach access for a non-detach touching instruction");
+//   NewAccess->setDefiningAccess(Definition);
+//   return NewAccess;
+// }
+
+// /// \brief Helper function to create new detach accesses
+// DetachUseOrDef *DetachSSA::createNewAccess(Instruction *I) {
+//   bool Def = isa<DetachInst>(I);
+//   bool Use = isa<SyncInst>(I);
+
+//   if (!Def && !Use)
+//     return nullptr;
+
+//   DetachUseOrDef *DUD;
+//   if (Def)
+//     DUD = new DetachDef(I->getContext(), nullptr, I,
+//                         cast<DetachInst>(I)->getContinue(), NextID++);
+//   else if (Use)
+//     DUD = new DetachUse(I->getContext(), nullptr, I, I->getParent());
+//   ValueToDetachAccess[I] = DUD;
+//   return DUD;
+// }
+
+/// \brief Returns true if \p Replacer dominates \p Replacee .
+bool DetachSSA::dominatesUse(const DetachAccess *Replacer,
+                             const DetachAccess *Replacee) const {
+  if (isa<DetachUseOrDef>(Replacee))
+    return DT->dominates(Replacer->getBlock(), Replacee->getBlock());
+  const auto *DP = cast<DetachPhi>(Replacee);
+  // For a phi node, the use occurs in the predecessor block of the phi node.
+  // Since we may occur multiple times in the phi node, we have to check each
+  // operand to ensure Replacer dominates each operand where Replacee occurs.
+  for (const Use &Arg : DP->operands()) {
+    if (Arg.get() != Replacee &&
+        !DT->dominates(Replacer->getBlock(), DP->getIncomingBlock(Arg)))
+      return false;
+  }
+  return true;
+}
+
+/// \brief Properly remove \p DA from all of DetachSSA's lookup tables.
+void DetachSSA::removeFromLookups(DetachAccess *DA) {
+  assert(DA->use_empty() &&
+         "Trying to remove detach access that still has uses");
+  BlockNumbering.erase(DA);
+  if (DetachUseOrDef *DUD = dyn_cast<DetachUseOrDef>(DA))
+    DUD->setDefiningAccess(nullptr);
+  // // Invalidate our walker's cache if necessary.
+  // if (!isa<DetachUse>(DA))
+  //   Walker->invalidateInfo(DA);
+  // The call below to erase will destroy DA, so we can't change the order we
+  // are doing things here.
+  Value *DAInst;
+  if (DetachUseOrDef *DUD = dyn_cast<DetachUseOrDef>(DA)) {
+    DAInst = DUD->getDAInst();
+  } else {
+    DAInst = DA->getBlock();
+  }
+  auto VDA = ValueToDetachAccess.find(DAInst);
+  if (VDA->second == DA)
+    ValueToDetachAccess.erase(VDA);
+}
+
+/// \brief Properly remove \p DA from all of DetachSSA's lists.
+///
+/// Because of the way the intrusive list and use lists work, it is important to
+/// do removal in the right order.
+/// ShouldDelete defaults to true, and will cause the detach access to also be
+/// deleted, not just removed.
+void DetachSSA::removeFromLists(DetachAccess *DA, bool ShouldDelete) {
+  // The access list owns the reference, so we erase it from the non-owning list
+  // first.
+  if (!isa<DetachUse>(DA)) {
+    auto DefsIt = PerBlockDefs.find(DA->getBlock());
+    std::unique_ptr<DefsList> &Defs = DefsIt->second;
+    Defs->remove(*DA);
+    if (Defs->empty())
+      PerBlockDefs.erase(DefsIt);
+  }
+
+  // The erase call here will delete it. If we don't want it deleted, we call
+  // remove instead.
+  auto AccessIt = PerBlockAccesses.find(DA->getBlock());
+  std::unique_ptr<AccessList> &Accesses = AccessIt->second;
+  if (ShouldDelete)
+    Accesses->erase(DA);
+  else
+    Accesses->remove(DA);
+
+  if (Accesses->empty())
+    PerBlockAccesses.erase(AccessIt);
+}
+
+void DetachSSA::print(raw_ostream &OS) const {
+  DetachSSAAnnotatedWriter Writer(this);
+  F.print(OS, &Writer);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void DetachSSA::dump() const { print(dbgs()); }
+#endif
+
+void DetachSSA::verifyDetachSSA() const {
+  verifyDefUses(F);
+  verifyDomination(F);
+  verifyOrdering(F);
+  // Walker->verify(this);
+}
+
+/// \brief Verify that the order and existence of DetachAccesses matches the
+/// order and existence of detach affecting instructions.
+void DetachSSA::verifyOrdering(Function &F) const {
+  // Walk all the blocks, comparing what the lookups think and what the access
+  // lists think, as well as the order in the blocks vs the order in the access
+  // lists.
+  SmallVector<DetachAccess *, 32> ActualAccesses;
+  SmallVector<DetachAccess *, 32> ActualDefs;
+  for (BasicBlock &B : F) {
+    const AccessList *AL = getBlockAccesses(&B);
+    const auto *DL = getBlockDefs(&B);
+    DetachAccess *Phi = getDetachAccess(&B);
+    if (Phi) {
+      ActualAccesses.push_back(Phi);
+      ActualDefs.push_back(Phi);
+    }
+
+    for (Instruction &I : B) {
+      DetachAccess *DA = getDetachAccess(&I);
+      assert((!DA || (AL && (isa<DetachUse>(DA) || DL))) &&
+             "We have detach affecting instructions "
+             "in this block but they are not in the "
+             "access list or defs list");
+      if (DA) {
+        ActualAccesses.push_back(DA);
+        if (isa<DetachDef>(DA))
+          ActualDefs.push_back(DA);
+      }
+    }
+    // Either we hit the assert, really have no accesses, or we have both
+    // accesses and an access list.
+    // Same with defs.
+    if (!AL && !DL)
+      continue;
+    assert(AL->size() == ActualAccesses.size() &&
+           "We don't have the same number of accesses in the block as on the "
+           "access list");
+    assert((DL || ActualDefs.size() == 0) &&
+           "Either we should have a defs list, or we should have no defs");
+    assert((!DL || DL->size() == ActualDefs.size()) &&
+           "We don't have the same number of defs in the block as on the "
+           "def list");
+    auto ALI = AL->begin();
+    auto AAI = ActualAccesses.begin();
+    while (ALI != AL->end() && AAI != ActualAccesses.end()) {
+      assert(&*ALI == *AAI && "Not the same accesses in the same order");
+      ++ALI;
+      ++AAI;
+    }
+    ActualAccesses.clear();
+    if (DL) {
+      auto DLI = DL->begin();
+      auto ADI = ActualDefs.begin();
+      while (DLI != DL->end() && ADI != ActualDefs.end()) {
+        assert(&*DLI == *ADI && "Not the same defs in the same order");
+        ++DLI;
+        ++ADI;
+      }
+    }
+    ActualDefs.clear();
+  }
+}
+
+/// \brief Verify the domination properties of DetachSSA by checking that each
+/// definition dominates all of its uses.
+void DetachSSA::verifyDomination(Function &F) const {
+#ifndef NDEBUG
+  for (BasicBlock &B : F) {
+    // Phi nodes are attached to basic blocks.
+    if (DetachPhi *DP = getDetachAccess(&B))
+      for (const Use &U : DP->uses())
+        assert(dominates(DP, U) && "Detach PHI does not dominate its uses");
+
+    for (Instruction &I : B) {
+      DetachAccess *MD = dyn_cast_or_null<DetachDef>(getDetachAccess(&I));
+      if (!MD)
+        continue;
+
+      for (const Use &U : MD->uses())
+        assert(dominates(MD, U) && "Detach Def does not dominate its uses");
+    }
+  }
+#endif
+}
+
+/// \brief Verify the def-use lists in DetachSSA, by verifying that \p Use
+/// appears in the use list of \p Def.
+void DetachSSA::verifyUseInDefs(DetachAccess *Def, DetachAccess *Use) const {
+#ifndef NDEBUG
+  if (!Def)
+    assert(isLiveOnEntryDef(Use) &&
+           "Null def but use does not point to the live on entry def");
+  else
+    assert(is_contained(Def->users(), Use) &&
+           "Did not find use in def's use list");
+#endif
+}
+
+/// \brief Verify the immediate use information, by walking all the detach
+/// accesses and verifying that, for each use, it appears in the
+/// appropriate def's use list.
+void DetachSSA::verifyDefUses(Function &F) const {
+  for (BasicBlock &B : F) {
+    // Phi nodes are attached to basic blocks.
+    if (DetachPhi *Phi = getDetachAccess(&B)) {
+      assert(Phi->getNumOperands() == static_cast<unsigned>(std::distance(
+                                          pred_begin(&B), pred_end(&B))) &&
+             "Incomplete DetachPhi Node");
+      for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I)
+        verifyUseInDefs(Phi->getIncomingValue(I), Phi);
+    }
+
+    for (Instruction &I : B) {
+      if (DetachUseOrDef *DA = getDetachAccess(&I)) {
+        verifyUseInDefs(DA->getDefiningAccess(), DA);
+      }
+    }
+  }
+}
+
+DetachUseOrDef *DetachSSA::getDetachAccess(const Instruction *I) const {
+  return cast_or_null<DetachUseOrDef>(ValueToDetachAccess.lookup(I));
+}
+
+DetachPhi *DetachSSA::getDetachAccess(const BasicBlock *BB) const {
+  return cast_or_null<DetachPhi>(ValueToDetachAccess.lookup(cast<Value>(BB)));
+}
+
+/// Perform a local numbering on blocks so that instruction ordering can be
+/// determined in constant time.
+/// TODO: We currently just number in order. If we numbered by N, we could
+/// allow at least N-1 sequences of insertBefore or insertAfter (and at least
+/// log2(N) sequences of mixed before and after) without needing to invalidate
+/// the numbering.
+void DetachSSA::renumberBlock(const BasicBlock *B) const {
+  // The pre-increment ensures the numbers really start at 1.
+  unsigned long CurrentNumber = 0;
+  const AccessList *AL = getBlockAccesses(B);
+  assert(AL != nullptr && "Asking to renumber an empty block");
+  for (const auto &I : *AL)
+    BlockNumbering[&I] = ++CurrentNumber;
+  BlockNumberingValid.insert(B);
+}
+
+/// \brief Determine, for two detach accesses in the same block,
+/// whether \p Dominator dominates \p Dominatee.
+/// \returns True if \p Dominator dominates \p Dominatee.
+bool DetachSSA::locallyDominates(const DetachAccess *Dominator,
+                                 const DetachAccess *Dominatee) const {
+  const BasicBlock *DominatorBlock = Dominator->getBlock();
+
+  assert((DominatorBlock == Dominatee->getBlock()) &&
+         "Asking for local domination when accesses are in different blocks!");
+  // A node dominates itself.
+  if (Dominatee == Dominator)
+    return true;
+
+  // When Dominatee is defined on function entry, it is not dominated by another
+  // detach access.
+  if (isLiveOnEntryDef(Dominatee))
+    return false;
+
+  // When Dominator is defined on function entry, it dominates the other detach
+  // access.
+  if (isLiveOnEntryDef(Dominator))
+    return true;
+
+  if (!BlockNumberingValid.count(DominatorBlock))
+    renumberBlock(DominatorBlock);
+
+  unsigned long DominatorNum = BlockNumbering.lookup(Dominator);
+  // All numbers start with 1.
+  assert(DominatorNum != 0 && "Block was not numbered properly");
+  unsigned long DominateeNum = BlockNumbering.lookup(Dominatee);
+  assert(DominateeNum != 0 && "Block was not numbered properly");
+  return DominatorNum < DominateeNum;
+}
+
+bool DetachSSA::dominates(const DetachAccess *Dominator,
+                          const DetachAccess *Dominatee) const {
+  if (Dominator == Dominatee)
+    return true;
+
+  if (isLiveOnEntryDef(Dominatee))
+    return false;
+
+  if (Dominator->getBlock() != Dominatee->getBlock())
+    return DT->dominates(Dominator->getBlock(), Dominatee->getBlock());
+  return locallyDominates(Dominator, Dominatee);
+}
+
+bool DetachSSA::dominates(const DetachAccess *Dominator,
+                          const Use &Dominatee) const {
+  if (DetachPhi *DP = dyn_cast<DetachPhi>(Dominatee.getUser())) {
+    BasicBlock *UseBB = DP->getIncomingBlock(Dominatee);
+    // The def must dominate the incoming block of the phi.
+    if (UseBB != Dominator->getBlock())
+      return DT->dominates(Dominator->getBlock(), UseBB);
+    // If the UseBB and the DefBB are the same, compare locally.
+    return locallyDominates(Dominator, cast<DetachAccess>(Dominatee));
+  }
+  // If it's not a PHI node use, the normal dominates can already handle it.
+  return dominates(Dominator, cast<DetachAccess>(Dominatee.getUser()));
+}
+
+void DetachAccess::print(raw_ostream &OS) const {
+  switch (getValueID()) {
+  case DetachPhiVal: return static_cast<const DetachPhi *>(this)->print(OS);
+  case DetachDefVal: return static_cast<const DetachDef *>(this)->print(OS);
+  case DetachUseVal: return static_cast<const DetachUse *>(this)->print(OS);
+  }
+  llvm_unreachable("invalid value id");
+}
+
+void DetachDef::print(raw_ostream &OS) const {
+  DetachAccess *UO = getDefiningAccess();
+
+  OS << getID() << " = DetachDef(";
+  if (UO && UO->getID())
+    OS << UO->getID();
+  OS << ')';
+}
+
+void DetachPhi::print(raw_ostream &OS) const {
+  bool First = true;
+  OS << getID() << " = DetachPhi(";
+  for (const auto &Op : operands()) {
+    BasicBlock *BB = getIncomingBlock(Op);
+    DetachAccess *DA = cast<DetachAccess>(Op);
+    if (!First)
+      OS << ',';
+    else
+      First = false;
+
+    OS << '{';
+    if (BB->hasName())
+      OS << BB->getName();
+    else
+      BB->printAsOperand(OS, false);
+    OS << ',';
+    if (unsigned ID = DA->getID())
+      OS << ID;
+    OS << '}';
+  }
+  OS << ')';
+}
+
+void DetachUse::print(raw_ostream &OS) const {
+  DetachAccess *UO = getDefiningAccess();
+  OS << "DetachUse(";
+  if (UO && UO->getID())
+    OS << UO->getID();
+  OS << ')';
+}
+
+void DetachAccess::dump() const {
+// Cannot completely remove virtual function even in release mode.
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  print(dbgs());
+  dbgs() << "\n";
+#endif
+}
+
+char DetachSSAPrinterLegacyPass::ID = 0;
+
+DetachSSAPrinterLegacyPass::DetachSSAPrinterLegacyPass() : FunctionPass(ID) {
+  initializeDetachSSAPrinterLegacyPassPass(*PassRegistry::getPassRegistry());
+}
+
+void DetachSSAPrinterLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  AU.addRequired<DetachSSAWrapperPass>();
+  AU.addPreserved<DetachSSAWrapperPass>();
+}
+
+bool DetachSSAPrinterLegacyPass::runOnFunction(Function &F) {
+  auto &DSSA = getAnalysis<DetachSSAWrapperPass>().getDSSA();
+  DSSA.print(dbgs());
+  if (VerifyDetachSSA)
+    DSSA.verifyDetachSSA();
+  return false;
+}
+
+AnalysisKey DetachSSAAnalysis::Key;
+
+DetachSSAAnalysis::Result DetachSSAAnalysis::run(Function &F,
+                                                 FunctionAnalysisManager &AM) {
+  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+  return DetachSSAAnalysis::Result(make_unique<DetachSSA>(F, &DT));
+}
+
+PreservedAnalyses DetachSSAPrinterPass::run(Function &F,
+                                            FunctionAnalysisManager &AM) {
+  OS << "DetachSSA for function: " << F.getName() << "\n";
+  AM.getResult<DetachSSAAnalysis>(F).getDSSA().print(OS);
+
+  return PreservedAnalyses::all();
+}
+
+PreservedAnalyses DetachSSAVerifierPass::run(Function &F,
+                                             FunctionAnalysisManager &AM) {
+  AM.getResult<DetachSSAAnalysis>(F).getDSSA().verifyDetachSSA();
+
+  return PreservedAnalyses::all();
+}
+
+char DetachSSAWrapperPass::ID = 0;
+
+DetachSSAWrapperPass::DetachSSAWrapperPass() : FunctionPass(ID) {
+  initializeDetachSSAWrapperPassPass(*PassRegistry::getPassRegistry());
+}
+
+void DetachSSAWrapperPass::releaseMemory() { DSSA.reset(); }
+
+void DetachSSAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  AU.addRequiredTransitive<DominatorTreeWrapperPass>();
+}
+
+bool DetachSSAWrapperPass::runOnFunction(Function &F) {
+  auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  DSSA.reset(new DetachSSA(F, &DT));
+  return false;
+}
+
+void DetachSSAWrapperPass::verifyAnalysis() const { DSSA->verifyDetachSSA(); }
+
+void DetachSSAWrapperPass::print(raw_ostream &OS, const Module *M) const {
+  DSSA->print(OS);
+}
+} // namespace llvm
+
+void DetachPhi::deleteMe(DerivedUser *Self) {
+  delete static_cast<DetachPhi *>(Self);
+}
+
+void DetachDef::deleteMe(DerivedUser *Self) {
+  delete static_cast<DetachDef *>(Self);
+}
+
+void DetachUse::deleteMe(DerivedUser *Self) {
+  delete static_cast<DetachUse *>(Self);
+}
diff --git a/llvm/lib/Analysis/MemorySSA.cpp b/llvm/lib/Analysis/MemorySSA.cpp
index 6a5567ed765bb2..7df5d9a8c03da8 100644
--- a/llvm/lib/Analysis/MemorySSA.cpp
+++ b/llvm/lib/Analysis/MemorySSA.cpp
@@ -137,7 +137,7 @@ class MemoryLocOrCall {
       IsCall = false;
       // There is no such thing as a memorylocation for a fence inst, and it is
       // unique in that regard.
-      if (!isa<FenceInst>(Inst))
+      if (!isa<FenceInst>(Inst) && !isa<DetachInst>(Inst))
        Loc = MemoryLocation::get(Inst);
     }
   }
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index eab7ec81953609..6b4e0e0207fcf9 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -859,6 +859,9 @@ lltok::Kind LLLexer::LexIdentifier() {
   INSTKEYWORD(invoke, Invoke);
   INSTKEYWORD(resume, Resume);
   INSTKEYWORD(unreachable, Unreachable);
+  INSTKEYWORD(detach, Detach);
+  INSTKEYWORD(reattach, Reattach);
+  INSTKEYWORD(sync, Sync);
 
   INSTKEYWORD(alloca, Alloca);
   INSTKEYWORD(load, Load);
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index ee634505581e81..6c4cd4207c61cf 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -5577,6 +5577,9 @@ int LLParser::ParseInstruction(Instruction *&Inst, BasicBlock *BB,
     Inst->setFastMathFlags(FMF);
     return false;
   }
+  case lltok::kw_detach:   return ParseDetach(Inst, PFS);
+  case lltok::kw_reattach: return ParseReattach(Inst, PFS);
+  case lltok::kw_sync:     return ParseSync(Inst, PFS);
 
   // Binary Operators.
  case lltok::kw_add:
  case lltok::kw_sub:
@@ -5776,6 +5779,89 @@ bool LLParser::ParseBr(Instruction *&Inst, PerFunctionState &PFS) {
   return false;
 }
 
+/// ParseDetach
+///   ::= 'detach' within SyncRegion ',' TypeAndValue ',' TypeAndValue
+bool LLParser::ParseDetach(Instruction *&Inst, PerFunctionState &PFS) {
+  LocTy Loc, Loc2;
+  Value *SR;
+  BasicBlock *Op1, *Op2;
+
+  if (ParseToken(lltok::kw_within, "expected 'within' after detach"))
+    return true;
+
+  if (Lex.getKind() != lltok::kw_none && Lex.getKind() != lltok::LocalVar &&
+      Lex.getKind() != lltok::LocalVarID)
+    return TokError("expected scope value for detach");
+
+  if (ParseValue(Type::getTokenTy(Context), SR, PFS))
+    return true;
+
+  if (ParseToken(lltok::comma, "expected ',' after detach scope"))
+    return true;
+
+  if (ParseTypeAndBasicBlock(Op1, Loc, PFS) ||
+      ParseToken(lltok::comma, "expected ',' after detached destination") ||
+      ParseTypeAndBasicBlock(Op2, Loc2, PFS))
+    return true;
+
+  Inst = DetachInst::Create(Op1, Op2, SR);
+  return false;
+}
+
+/// ParseReattach
+///   ::= 'reattach' within SyncRegion ',' TypeAndValue
+bool LLParser::ParseReattach(Instruction *&Inst, PerFunctionState &PFS) {
+  LocTy Loc;
+  Value *SR;
+  BasicBlock *Op;
+
+  if (ParseToken(lltok::kw_within, "expected 'within' after reattach"))
+    return true;
+
+  if (Lex.getKind() != lltok::kw_none && Lex.getKind() != lltok::LocalVar &&
+      Lex.getKind() != lltok::LocalVarID)
+    return TokError("expected scope value for reattach");
+
+  if (ParseValue(Type::getTokenTy(Context), SR, PFS))
+    return true;
+
+  if (ParseToken(lltok::comma, "expected ',' after reattach scope"))
+    return true;
+
+  if (ParseTypeAndBasicBlock(Op, Loc, PFS))
+    return true;
+
+  Inst = ReattachInst::Create(Op, SR);
+  return false;
+}
+
+/// ParseSync
+///   ::= 'sync' within SyncRegion ',' TypeAndValue
+bool LLParser::ParseSync(Instruction *&Inst, PerFunctionState &PFS) {
+  LocTy Loc;
+  Value *SR;
+  BasicBlock *Op;
+
+  if (ParseToken(lltok::kw_within, "expected 'within' after sync"))
+    return true;
+
+  if (Lex.getKind() != lltok::kw_none && Lex.getKind() != lltok::LocalVar &&
+      Lex.getKind() != lltok::LocalVarID)
+    return TokError("expected scope value for sync");
+
+  if (ParseValue(Type::getTokenTy(Context), SR, PFS))
+    return true;
+
+  if (ParseToken(lltok::comma, "expected ',' after scope in sync"))
+    return true;
+
+  if (ParseTypeAndBasicBlock(Op, Loc, PFS))
+    return true;
+
+  Inst = SyncInst::Create(Op, SR);
+  return false;
+}
+
 /// ParseSwitch
 ///  Instruction
 ///    ::= 'switch' TypeAndValue ',' TypeAndValue '[' JumpTable ']'
diff --git a/llvm/lib/AsmParser/LLParser.h b/llvm/lib/AsmParser/LLParser.h
index 5a0fc297265d4d..2b53bbea557b4d 100644
--- a/llvm/lib/AsmParser/LLParser.h
+++ b/llvm/lib/AsmParser/LLParser.h
@@ -571,6 +571,9 @@ namespace llvm {
     bool ParseCatchSwitch(Instruction *&Inst, PerFunctionState &PFS);
     bool ParseCatchPad(Instruction *&Inst, PerFunctionState &PFS);
     bool ParseCleanupPad(Instruction *&Inst, PerFunctionState &PFS);
+    bool ParseDetach(Instruction *&Inst, PerFunctionState &PFS);
+    bool ParseReattach(Instruction *&Inst, PerFunctionState &PFS);
+    bool ParseSync(Instruction *&Inst, PerFunctionState &PFS);
 
     bool ParseUnaryOp(Instruction *&Inst, PerFunctionState &PFS, unsigned Opc,
                       unsigned OperandType);
diff --git a/llvm/lib/AsmParser/LLToken.h b/llvm/lib/AsmParser/LLToken.h
index c2e2795a9467be..d21527f347a211 100644
--- a/llvm/lib/AsmParser/LLToken.h
+++ b/llvm/lib/AsmParser/LLToken.h
@@ -344,6 +344,11 @@ enum Kind {
   kw_insertvalue,
   kw_blockaddress,
 
+  // Tapir types
+  kw_detach,
+  kw_reattach,
+  kw_sync,
+
   // Metadata types.
  kw_distinct,
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index fe051e7a91256d..1173e1e8792616 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -4231,6 +4231,59 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
       I = new UnreachableInst(Context);
      InstructionList.push_back(I);
      break;
+    case bitc::FUNC_CODE_INST_DETACH: { // DETACH: [bb#, bb#, val]
+      if (Record.size() != 3)
+        return error("Invalid record");
+      BasicBlock *Detached = getBasicBlock(Record[0]);
+      if (!Detached)
+        return error("Invalid record");
+
+      BasicBlock *Continue = getBasicBlock(Record[1]);
+      if (!Continue)
+        return error("Invalid record");
+
+      Value *SyncRegion =
+          getValue(Record, 2, NextValueNo, Type::getTokenTy(Context));
+      if (!SyncRegion)
+        return error("Invalid record");
+
+      I = DetachInst::Create(Detached, Continue, SyncRegion);
+      InstructionList.push_back(I);
+      break;
+    }
+    case bitc::FUNC_CODE_INST_REATTACH: { // REATTACH: [bb#, val]
+      if (Record.size() != 2)
+        return error("Invalid record");
+
+      BasicBlock *DetachContinue = getBasicBlock(Record[0]);
+      if (!DetachContinue)
+        return error("Invalid record");
+
+      Value *SyncRegion =
+          getValue(Record, 1, NextValueNo, Type::getTokenTy(Context));
+      if (!SyncRegion)
+        return error("Invalid record");
+
+      I = ReattachInst::Create(DetachContinue, SyncRegion);
+      InstructionList.push_back(I);
+      break;
+    }
+    case bitc::FUNC_CODE_INST_SYNC: { // Sync: [bb#, val]
+      if (Record.size() != 2)
+        return error("Invalid record");
+      BasicBlock *Continue = getBasicBlock(Record[0]);
+      if (!Continue)
+        return error("Invalid record");
+
+      Value *SyncRegion =
+          getValue(Record, 1, NextValueNo, Type::getTokenTy(Context));
+      if (!SyncRegion)
+        return error("Invalid record");
+
+      I = SyncInst::Create(Continue, SyncRegion);
+      InstructionList.push_back(I);
+      break;
+    }
     case bitc::FUNC_CODE_INST_PHI: { // PHI: [ty, val0,bb0, ...]
      if (Record.size() < 1 || ((Record.size()-1)&1))
        return error("Invalid record");
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index ba4f932e2e6db8..26d032ffe47c1d 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -2780,6 +2780,31 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I,
     Code = bitc::FUNC_CODE_INST_UNREACHABLE;
     AbbrevToUse = FUNCTION_INST_UNREACHABLE_ABBREV;
     break;
+  case Instruction::Detach:
+    {
+      Code = bitc::FUNC_CODE_INST_DETACH;
+      const DetachInst &DI = cast<DetachInst>(I);
+      Vals.push_back(VE.getValueID(DI.getSuccessor(0)));
+      Vals.push_back(VE.getValueID(DI.getSuccessor(1)));
+      pushValue(DI.getSyncRegion(), InstID, Vals);
+    }
+    break;
+  case Instruction::Reattach:
+    {
+      Code = bitc::FUNC_CODE_INST_REATTACH;
+      const ReattachInst &RI = cast<ReattachInst>(I);
+      Vals.push_back(VE.getValueID(RI.getSuccessor(0)));
+      pushValue(RI.getSyncRegion(), InstID, Vals);
+    }
+    break;
+  case Instruction::Sync:
+    {
+      Code = bitc::FUNC_CODE_INST_SYNC;
+      const SyncInst &SI = cast<SyncInst>(I);
+      Vals.push_back(VE.getValueID(SI.getSuccessor(0)));
+      pushValue(SI.getSyncRegion(), InstID, Vals);
+    }
+    break;
   case Instruction::PHI: {
     const PHINode &PN = cast<PHINode>(I);
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 95f6274aa068be..a451527c5bb472 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -476,6 +476,62 @@ bool IRTranslator::translateIndirectBr(const User &U,
   return true;
 }
 
+bool IRTranslator::translateDetach(const User &U,
+                                   MachineIRBuilder &MIRBuilder) {
+  const DetachInst &DetInst = cast<DetachInst>(U);
+
+  // Lowering of Tapir instructions should have happened already.  At this
+  // stage, treat Detach like an unconditional branch to the detached successor.
+  const BasicBlock &DetTgt = *cast<BasicBlock>(DetInst.getDetached());
+  MachineBasicBlock &TgtBB = getMBB(DetTgt);
+  MachineBasicBlock &CurBB = MIRBuilder.getMBB();
+
+  // If the detached successor is the layout successor, fallthrough.
+  if (!CurBB.isLayoutSuccessor(&TgtBB))
+    MIRBuilder.buildBr(TgtBB);
+
+  // Link detached successor.
+  CurBB.addSuccessor(&getMBB(*cast<BasicBlock>(DetInst.getDetached())));
+  return true;
+}
+
+bool IRTranslator::translateReattach(const User &U,
+                                     MachineIRBuilder &MIRBuilder) {
+  const ReattachInst &ReatInst = cast<ReattachInst>(U);
+
+  // Lowering of Tapir instructions should have happened already.  At this
+  // stage, treat Reattach like an unconditional branch to its successor.
+  const BasicBlock &ReatTgt = *cast<BasicBlock>(ReatInst.getSuccessor(0));
+  MachineBasicBlock &TgtBB = getMBB(ReatTgt);
+  MachineBasicBlock &CurBB = MIRBuilder.getMBB();
+
+  // If the reattach successor is the layout successor, fallthrough.
+  if (!CurBB.isLayoutSuccessor(&TgtBB))
+    MIRBuilder.buildBr(TgtBB);
+
+  // Link the Reattach instruction's successor.
+  CurBB.addSuccessor(&getMBB(*cast<BasicBlock>(ReatInst.getSuccessor(0))));
+  return true;
+}
+
+bool IRTranslator::translateSync(const User &U, MachineIRBuilder &MIRBuilder) {
+  const SyncInst &SInst = cast<SyncInst>(U);
+
+  // Lowering of Tapir instructions should have happened already.  At this
+  // stage, treat Sync like an unconditional branch to its successor.
+  const BasicBlock &STgt = *cast<BasicBlock>(SInst.getSuccessor(0));
+  MachineBasicBlock &TgtBB = getMBB(STgt);
+  MachineBasicBlock &CurBB = MIRBuilder.getMBB();
+
+  // If the sync successor is the layout successor, fallthrough.
+  if (!CurBB.isLayoutSuccessor(&TgtBB))
+    MIRBuilder.buildBr(TgtBB);
+
+  // Link the Sync instruction's successor.
+ CurBB.addSuccessor(&getMBB(*cast(SInst.getSuccessor(0)))); + return true; +} + bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) { const LoadInst &LI = cast(U); diff --git a/llvm/lib/CodeGen/MachineSSAUpdater.cpp b/llvm/lib/CodeGen/MachineSSAUpdater.cpp index 542491eabbf29c..a6fef51aa3098d 100644 --- a/llvm/lib/CodeGen/MachineSSAUpdater.cpp +++ b/llvm/lib/CodeGen/MachineSSAUpdater.cpp @@ -299,6 +299,16 @@ class SSAUpdaterTraits { return NewDef->getOperand(0).getReg(); } + static bool BlockReattaches(MachineBasicBlock *BB, + MachineSSAUpdater *Updater) { + return false; + } + + static bool BlockDetaches(MachineBasicBlock *BB, + MachineSSAUpdater *Updater) { + return false; + } + /// CreateEmptyPHI - Create a PHI instruction that defines a new register. /// Add it into the specified block and return the register. static unsigned CreateEmptyPHI(MachineBasicBlock *BB, unsigned NumPreds, @@ -344,6 +354,12 @@ class SSAUpdaterTraits { static unsigned GetPHIValue(MachineInstr *PHI) { return PHI->getOperand(0).getReg(); } + + static void MarkDetachedDef(unsigned Val, MachineBasicBlock *BB, + MachineSSAUpdater *Updater) { + return; + } + }; } // end namespace llvm diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp index cdc597db640166..43e4fd352c6ddb 100644 --- a/llvm/lib/CodeGen/MachineSink.cpp +++ b/llvm/lib/CodeGen/MachineSink.cpp @@ -555,6 +555,28 @@ bool MachineSinking::isProfitableToSinkTo(unsigned Reg, MachineInstr &MI, return false; } +static inline bool hasSetJmpPred( MachineBasicBlock *bl0 ) { + +// llvm::errs() << "\n"; +// bl0->dump(); +// llvm::errs() << "\n"; + + for( auto bl : bl0->predecessors() ) { +// llvm::errs() << " \n"; + auto term = bl->getFirstTerminator(); + while( term != bl->end() ) { + auto mc = (*term).getDesc(); +// if (mc.Opcode != 777) continue; + if (mc.Opcode == 777) { return true; } +// llvm::errs() << " flags:" << mc.Flags << " opc:" << mc.Opcode << "\n"; +// term->dump(); + 
term++; + } +// llvm::errs() << " \n"; + } + return false; +} + /// Get the sorted sequence of successors for this MachineBasicBlock, possibly /// computing it if it was not already cached. SmallVector & @@ -565,7 +587,7 @@ MachineSinking::GetAllSortedSuccessors(MachineInstr &MI, MachineBasicBlock *MBB, if (Succs != AllSuccessors.end()) return Succs->second; - SmallVector AllSuccs(MBB->succ_begin(), + SmallPtrSet AllSuccs0(MBB->succ_begin(), MBB->succ_end()); // Handle cases where sinking can happen but where the sink point isn't a @@ -582,7 +604,43 @@ MachineSinking::GetAllSortedSuccessors(MachineInstr &MI, MachineBasicBlock *MBB, if (DTChild->getIDom()->getBlock() == MI.getParent() && // Skip MBBs already added to the AllSuccs vector above. !MBB->isSuccessor(DTChild->getBlock())) - AllSuccs.push_back(DTChild->getBlock()); + AllSuccs0.insert(DTChild->getBlock()); + + ///* + bool unstable = true; + while(unstable) { + unstable = false; + SmallPtrSet toRemove; + for( auto bl0 : AllSuccs0 ) { + //if (hasSetJmpPred(bl0)) assert(bl0->hasAddressTaken()); + if (toRemove.count(bl0) == 0 && (hasSetJmpPred(bl0) || bl0->hasAddressTaken()) ) { + SmallVector Q; + Q.push_back(bl0); + toRemove.insert(bl0); + while( Q.size() > 0 ) { + auto f = Q.back(); + Q.pop_back(); + //llvm::errs() << "saw and removing: " << f->getFullName() << "$BB#" << f->getNumber() << "\n"; + for( auto a : f->successors() ) { + if ( toRemove.count(a) > 0 || AllSuccs0.count(a) == 0 ) continue; + toRemove.insert(a); + Q.push_back(a); + } + } + unstable = true; + } + } + for (auto b : toRemove) { + AllSuccs0.erase(b); + } + } // */ + + //MBB->dump(); + //llvm::errs() << "CHECK CHILDREN FOR " << MBB->getFullName() << "$BB#" << MBB->getNumber() << ": " << "|{"; + //for( auto a : AllSuccs0 ) llvm::errs() << a->getFullName() << "$BB#" << a->getNumber() << ","; + //llvm::errs() << "}\n"; + SmallVector AllSuccs(AllSuccs0.begin(), + AllSuccs0.end()); // Sort Successors according to their loop depth or block 
frequency info. std::stable_sort( diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index bfeb3d1bc2b91f..7fa157cc1bac4b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -2671,6 +2671,66 @@ void SelectionDAGBuilder::visitUnreachable(const UnreachableInst &I) { DAG.setRoot(DAG.getNode(ISD::TRAP, getCurSDLoc(), MVT::Other, DAG.getRoot())); } +void SelectionDAGBuilder::visitDetach(const DetachInst &I) { + MachineBasicBlock *DetachMBB = FuncInfo.MBB; + + // Update machine-CFG edges. + MachineBasicBlock *Detached = FuncInfo.MBBMap[I.getSuccessor(0)]; + //MachineBasicBlock *Continue = FuncInfo.MBBMap[I.getSuccessor(1)]; + + // Update machine-CFG edges. + DetachMBB->addSuccessor(Detached); + + // If this is not a fall-through branch or optimizations are switched off, + // emit the branch. + if (Detached != NextBlock(DetachMBB) || TM.getOptLevel() == CodeGenOpt::None) + DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), + MVT::Other, getControlRoot(), + DAG.getBasicBlock(Detached))); + + return; + +} + +void SelectionDAGBuilder::visitReattach(const ReattachInst &I) { + MachineBasicBlock *ReattachMBB = FuncInfo.MBB; + + // Update machine-CFG edges. + MachineBasicBlock *Continue = FuncInfo.MBBMap[I.getSuccessor(0)]; + + // Update machine-CFG edges. + ReattachMBB->addSuccessor(Continue); + + // If this is not a fall-through branch or optimizations are switched off, + // emit the branch. + if (Continue != NextBlock(ReattachMBB) || TM.getOptLevel() == CodeGenOpt::None) + DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), + MVT::Other, getControlRoot(), + DAG.getBasicBlock(Continue))); + + return; +} + +void SelectionDAGBuilder::visitSync(const SyncInst &I) { + MachineBasicBlock *SyncMBB = FuncInfo.MBB; + + // Update machine-CFG edges. 
+ MachineBasicBlock *Continue = FuncInfo.MBBMap[I.getSuccessor(0)]; + + // Update machine-CFG edges. + SyncMBB->addSuccessor(Continue); + + // If this is not a fall-through branch or optimizations are switched off, + // emit the branch. + if (Continue != NextBlock(SyncMBB) || TM.getOptLevel() == CodeGenOpt::None) + DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), + MVT::Other, getControlRoot(), + DAG.getBasicBlock(Continue))); + + return; +} + + void SelectionDAGBuilder::visitFSub(const User &I) { // -0.0 - X --> fneg Type *Ty = I.getType(); @@ -6375,6 +6435,10 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { // MachineFunction in SelectionDAGISel::PrepareEHLandingPad. We can safely // delete it now. return nullptr; + // Tapir intrinsics + // Lower the starting point of a sync region to a no-op. + case Intrinsic::syncregion_start: + return nullptr; } } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index 5f9cdb69daf72d..b0cc4725884aa8 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -57,6 +57,7 @@ class ConstantInt; class ConstrainedFPIntrinsic; class DbgValueInst; class DataLayout; +class DetachInst; class DIExpression; class DILocalVariable; class DILocation; @@ -72,11 +73,13 @@ class LLVMContext; class LoadInst; class MachineBasicBlock; class PHINode; +class ReattachInst; class ResumeInst; class ReturnInst; class SDDbgValue; class StoreInst; class SwitchInst; +class SyncInst; class TargetLibraryInfo; class TargetMachine; class Type; @@ -825,6 +828,9 @@ class SelectionDAGBuilder { void visitCatchRet(const CatchReturnInst &I); void visitCatchPad(const CatchPadInst &I); void visitCleanupPad(const CleanupPadInst &CPI); + void visitDetach(const DetachInst& I); + void visitReattach(const ReattachInst& I); + void visitSync(const SyncInst& I); BranchProbability getEdgeProbability(const 
MachineBasicBlock *Src, const MachineBasicBlock *Dst) const; diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index e8619037564245..4edebace9622ea 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -1455,6 +1455,9 @@ int TargetLoweringBase::InstructionOpcodeToISD(unsigned Opcode) const { case CatchSwitch: return 0; case CleanupPad: return 0; case FNeg: return ISD::FNEG; + case Detach: return 0; + case Reattach: return 0; + case Sync: return 0; case Add: return ISD::ADD; case FAdd: return ISD::FADD; case Sub: return ISD::SUB; diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index a5dc623e1a30fe..adead5e5dc1d62 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -3637,6 +3637,29 @@ void AssemblyWriter::printInstruction(const Instruction &I) { writeOperand(BI.getSuccessor(0), true); Out << ", "; writeOperand(BI.getSuccessor(1), true); + } else if (isa(I)) { + // Special case detach instruction to get formatting nice and correct + const DetachInst &DI(cast(I)); + Out << " within "; + writeOperand(DI.getSyncRegion(), /*PrintType=*/false); + Out << ", "; + writeOperand(DI.getDetached(), true); + Out << ", "; + writeOperand(DI.getContinue(), true); + } else if (isa(I)) { + // Special case reattach instruction to get formatting nice and correct + const ReattachInst &RI(cast(I)); + Out << " within "; + writeOperand(RI.getSyncRegion(), /*PrintType=*/false); + Out << ", "; + writeOperand(RI.getSuccessor(0), true); + } else if (isa(I)) { + // Special case sync instruction to get formatting nice and correct + const SyncInst &SI(cast(I)); + Out << " within "; + writeOperand(SI.getSyncRegion(), /*PrintType=*/false); + Out << ", "; + writeOperand(SI.getSuccessor(0), true); } else if (isa(I)) { const SwitchInst& SI(cast(I)); diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp index 375924360dda83..213c8deedc0bd7 100644 --- 
a/llvm/lib/IR/BasicBlock.cpp +++ b/llvm/lib/IR/BasicBlock.cpp @@ -443,6 +443,48 @@ BasicBlock *BasicBlock::splitBasicBlock(iterator I, const Twine &BBName) { return New; } +BasicBlock *BasicBlock::splitBasicBlockWithTerminator(const Twine &BBName) { + auto term = getTerminator(); + assert(term && "Can't use splitBasicBlock on degenerate BB!"); + assert(term->getNumSuccessors() == 1 && "Number of successors must be 1"); + + BasicBlock *New = BasicBlock::Create(getContext(), BBName, getParent(), + this->getNextNode()); + + // Save DebugLoc of split point before invalidating iterator. + DebugLoc Loc = term->getDebugLoc(); + // Move all of the specified instructions from the original basic block into + // the new basic block. + auto suc = term->getSuccessor(0); + term->setSuccessor(0, New); + + // Add a branch instruction to the newly formed basic block. + BranchInst *BI = BranchInst::Create(suc, New); + BI->setDebugLoc(Loc); + + // Now we must loop through all of the successors of the New block (which + // _were_ the successors of the 'this' block), and update any PHI nodes in + // successors. If there were PHI nodes in the successors, then they need to + // know that incoming branches will be from New, not from Old. + // + for (succ_iterator I = succ_begin(New), E = succ_end(New); I != E; ++I) { + // Loop over any phi nodes in the basic block, updating the BB field of + // incoming values... 
+ BasicBlock *Successor = *I; + PHINode *PN; + for (BasicBlock::iterator II = Successor->begin(); + (PN = dyn_cast(II)); ++II) { + int IDX = PN->getBasicBlockIndex(this); + while (IDX != -1) { + PN->setIncomingBlock((unsigned)IDX, New); + IDX = PN->getBasicBlockIndex(this); + } + } + } + + return New; +} + void BasicBlock::replaceSuccessorsPhiUsesWith(BasicBlock *New) { Instruction *TI = getTerminator(); if (!TI) diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp index d861b5288592ca..57d3923622991b 100644 --- a/llvm/lib/IR/Instruction.cpp +++ b/llvm/lib/IR/Instruction.cpp @@ -302,6 +302,9 @@ const char *Instruction::getOpcodeName(unsigned OpCode) { case CatchRet: return "catchret"; case CatchPad: return "catchpad"; case CatchSwitch: return "catchswitch"; + case Detach: return "detach"; + case Reattach: return "reattach"; + case Sync: return "sync"; // Standard unary operators... case FNeg: return "fneg"; @@ -510,6 +513,7 @@ bool Instruction::mayReadFromMemory() const { case Instruction::VAArg: case Instruction::Load: case Instruction::Fence: // FIXME: refine definition of mayReadFromMemory + case Instruction::Sync: // Like Instruction::Fence case Instruction::AtomicCmpXchg: case Instruction::AtomicRMW: case Instruction::CatchPad: @@ -528,6 +532,7 @@ bool Instruction::mayWriteToMemory() const { switch (getOpcode()) { default: return false; case Instruction::Fence: // FIXME: refine definition of mayWriteToMemory + case Instruction::Sync: // Like Instruction::Fence case Instruction::Store: case Instruction::VAArg: case Instruction::AtomicCmpXchg: diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index 06b46724a87f80..81bb40423e8234 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -973,6 +973,180 @@ UnreachableInst::UnreachableInst(LLVMContext &Context, BasicBlock *InsertAtEnd) : Instruction(Type::getVoidTy(Context), Instruction::Unreachable, nullptr, 0, InsertAtEnd) {} 
+//===----------------------------------------------------------------------===// +// DetachInst Implementation +//===----------------------------------------------------------------------===// + +void DetachInst::AssertOK() { + assert(getSyncRegion()->getType()->isTokenTy() && + "Sync region must be a token!"); +} + +DetachInst::DetachInst(BasicBlock *Detached, BasicBlock *Continue, + Value *SyncRegion, + Instruction *InsertBefore) + : TerminatorInst(Type::getVoidTy(Detached->getContext()), + Instruction::Detach, + OperandTraits::op_end(this) - 3, 3, + InsertBefore) { + Op<-1>() = Detached; + Op<-2>() = Continue; + Op<-3>() = SyncRegion; +#ifndef NDEBUG + AssertOK(); +#endif +} + +DetachInst::DetachInst(BasicBlock *Detached, BasicBlock *Continue, + Value *SyncRegion, + BasicBlock *InsertAtEnd) + : TerminatorInst(Type::getVoidTy(Detached->getContext()), + Instruction::Detach, + OperandTraits::op_end(this) - 3, 3, + InsertAtEnd) { + Op<-1>() = Detached; + Op<-2>() = Continue; + Op<-3>() = SyncRegion; +#ifndef NDEBUG + AssertOK(); +#endif +} + + +DetachInst::DetachInst(const DetachInst &DI) + : TerminatorInst(Type::getVoidTy(DI.getContext()), Instruction::Detach, + OperandTraits::op_end(this) - + DI.getNumOperands(), + DI.getNumOperands()) { + Op<-1>() = DI.Op<-1>(); + Op<-2>() = DI.Op<-2>(); + Op<-3>() = DI.Op<-3>(); + assert(DI.getNumOperands() == 3 && "Detach must have 3 operands!"); + SubclassOptionalData = DI.SubclassOptionalData; +} + +BasicBlock *DetachInst::getSuccessorV(unsigned idx) const { + return getSuccessor(idx); +} +unsigned DetachInst::getNumSuccessorsV() const { + return getNumSuccessors(); +} +void DetachInst::setSuccessorV(unsigned idx, BasicBlock *B) { + setSuccessor(idx, B); +} + +//===----------------------------------------------------------------------===// +// ReattachInst Implementation +//===----------------------------------------------------------------------===// + +void ReattachInst::AssertOK() { + 
assert(getSyncRegion()->getType()->isTokenTy() && + "Sync region must be a token!"); +} + +ReattachInst::ReattachInst(BasicBlock *DetachContinue, Value *SyncRegion, + Instruction *InsertBefore) + : TerminatorInst(Type::getVoidTy(DetachContinue->getContext()), + Instruction::Reattach, + OperandTraits::op_end(this) - 2, 2, + InsertBefore) { + Op<-1>() = DetachContinue; + Op<-2>() = SyncRegion; +#ifndef NDEBUG + AssertOK(); +#endif +} + +ReattachInst::ReattachInst(BasicBlock *DetachContinue, Value *SyncRegion, + BasicBlock *InsertAtEnd) + : TerminatorInst(Type::getVoidTy(DetachContinue->getContext()), + Instruction::Reattach, + OperandTraits::op_end(this) - 2, 2, + InsertAtEnd) { + Op<-1>() = DetachContinue; + Op<-2>() = SyncRegion; +#ifndef NDEBUG + AssertOK(); +#endif +} + +ReattachInst::ReattachInst(const ReattachInst &RI) + : TerminatorInst(Type::getVoidTy(RI.getContext()), Instruction::Reattach, + OperandTraits::op_end(this) - + RI.getNumOperands(), + RI.getNumOperands()) { + Op<-1>() = RI.Op<-1>(); + Op<-2>() = RI.Op<-2>(); + assert(RI.getNumOperands() == 2 && "Reattach must have 2 operands!"); + SubclassOptionalData = RI.SubclassOptionalData; +} + +unsigned ReattachInst::getNumSuccessorsV() const { + return getNumSuccessors(); +} + +BasicBlock *ReattachInst::getSuccessorV(unsigned idx) const { + return getSuccessor(idx); +} + +void ReattachInst::setSuccessorV(unsigned idx, BasicBlock *B) { + setSuccessor(idx, B); +} + +//===----------------------------------------------------------------------===// +// SyncInst Implementation +//===----------------------------------------------------------------------===// + +void SyncInst::AssertOK() { + assert(getSyncRegion()->getType()->isTokenTy() && + "Sync region must be a token!"); +} + +SyncInst::SyncInst(BasicBlock *Continue, Value *SyncRegion, + Instruction *InsertBefore) + : TerminatorInst(Type::getVoidTy(Continue->getContext()), Instruction::Sync, + OperandTraits::op_end(this) - 2, 2, + InsertBefore) { + Op<-1>() = 
Continue; + Op<-2>() = SyncRegion; +#ifndef NDEBUG + AssertOK(); +#endif +} + +SyncInst::SyncInst(BasicBlock *Continue, Value *SyncRegion, + BasicBlock *InsertAtEnd) + : TerminatorInst(Type::getVoidTy(Continue->getContext()), Instruction::Sync, + OperandTraits::op_end(this) - 2, 2, + InsertAtEnd) { + Op<-1>() = Continue; + Op<-2>() = SyncRegion; +#ifndef NDEBUG + AssertOK(); +#endif +} + + +SyncInst::SyncInst(const SyncInst &SI) : + TerminatorInst(Type::getVoidTy(SI.getContext()), Instruction::Sync, + OperandTraits::op_end(this) - SI.getNumOperands(), + SI.getNumOperands()) { + Op<-1>() = SI.Op<-1>(); + Op<-2>() = SI.Op<-2>(); + assert(SI.getNumOperands() == 2 && "Sync must have 2 operands!"); + SubclassOptionalData = SI.SubclassOptionalData; +} + +BasicBlock *SyncInst::getSuccessorV(unsigned idx) const { + return getSuccessor(idx); +} +unsigned SyncInst::getNumSuccessorsV() const { + return getNumSuccessors(); +} +void SyncInst::setSuccessorV(unsigned idx, BasicBlock *B) { + setSuccessor(idx, B); +} + //===----------------------------------------------------------------------===// // BranchInst Implementation //===----------------------------------------------------------------------===// @@ -4000,3 +4174,15 @@ UnreachableInst *UnreachableInst::cloneImpl() const { LLVMContext &Context = getContext(); return new UnreachableInst(Context); } + +DetachInst *DetachInst::cloneImpl() const { + return new(getNumOperands()) DetachInst(*this); +} + +ReattachInst *ReattachInst::cloneImpl() const { + return new(getNumOperands()) ReattachInst(*this); +} + +SyncInst *SyncInst::cloneImpl() const { + return new(getNumOperands()) SyncInst(*this); +} diff --git a/llvm/lib/IR/Type.cpp b/llvm/lib/IR/Type.cpp index 0fb079c5ab7395..6c7255f4319e50 100644 --- a/llvm/lib/IR/Type.cpp +++ b/llvm/lib/IR/Type.cpp @@ -366,6 +366,13 @@ StructType *StructType::get(LLVMContext &Context, ArrayRef ETypes, return ST; } +StructType *StructType::getOrCreate(LLVMContext &Context, StringRef Name) { + 
StructType *Ty = Context.pImpl->NamedStructTypes.lookup(Name); + if (!Ty) + Ty = StructType::create(Context, Name); + return Ty; +} + void StructType::setBody(ArrayRef Elements, bool isPacked) { assert(isOpaque() && "Struct body already set!"); diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 30e77b92009f0f..00fdc08b066e2c 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -105,6 +105,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +//#include "llvm/Transforms/Tapir/CilkABI.h" #include #include #include @@ -342,6 +343,12 @@ class Verifier : public InstVisitor, VerifierSupport { BB.printAsOperand(*OS, true, MST); *OS << "\n"; } + // if (const DetachInst* Det = dyn_cast(&I->back())) { + // if (!cilk::verifyDetachedCFG(*Det, DT)) { + // OS << "Invalid end to detached CFG\n"; + // return true; + // } + // } return false; } diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 5ec94ea6f40ab0..2d935a1074dd88 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -30,6 +30,7 @@ #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/DemandedBits.h" #include "llvm/Analysis/DependenceAnalysis.h" +#include "llvm/Analysis/DetachSSA.h" #include "llvm/Analysis/DominanceFrontier.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/IVUsers.h" diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 771d2f5b212ae9..c24cf8e33375d1 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -112,6 +112,7 @@ FUNCTION_ANALYSIS("branch-prob", BranchProbabilityAnalysis()) FUNCTION_ANALYSIS("domtree", DominatorTreeAnalysis()) FUNCTION_ANALYSIS("postdomtree", PostDominatorTreeAnalysis()) FUNCTION_ANALYSIS("demanded-bits", DemandedBitsAnalysis()) +FUNCTION_ANALYSIS("detachssa", DetachSSAAnalysis()) FUNCTION_ANALYSIS("domfrontier", 
DominanceFrontierAnalysis()) FUNCTION_ANALYSIS("loops", LoopAnalysis()) FUNCTION_ANALYSIS("lazy-value-info", LazyValueAnalysis()) @@ -202,6 +203,7 @@ FUNCTION_PASS("print", AssumptionPrinterPass(dbgs())) FUNCTION_PASS("print", BlockFrequencyPrinterPass(dbgs())) FUNCTION_PASS("print", BranchProbabilityPrinterPass(dbgs())) FUNCTION_PASS("print", DependenceAnalysisPrinterPass(dbgs())) +FUNCTION_PASS("print", DetachSSAPrinterPass(dbgs())) FUNCTION_PASS("print", DominatorTreePrinterPass(dbgs())) FUNCTION_PASS("print", PostDominatorTreePrinterPass(dbgs())) FUNCTION_PASS("print", DemandedBitsPrinterPass(dbgs())) @@ -224,6 +226,7 @@ FUNCTION_PASS("sroa", SROA()) FUNCTION_PASS("tailcallelim", TailCallElimPass()) FUNCTION_PASS("unreachableblockelim", UnreachableBlockElimPass()) FUNCTION_PASS("verify", VerifierPass()) +FUNCTION_PASS("verify", DetachSSAVerifierPass()) FUNCTION_PASS("verify", DominatorTreeVerifierPass()) FUNCTION_PASS("verify", LoopVerifierPass()) FUNCTION_PASS("verify", MemorySSAVerifierPass()) diff --git a/llvm/lib/Transforms/CMakeLists.txt b/llvm/lib/Transforms/CMakeLists.txt index 74db9e53304da9..c39bc7e368d379 100644 --- a/llvm/lib/Transforms/CMakeLists.txt +++ b/llvm/lib/Transforms/CMakeLists.txt @@ -8,3 +8,4 @@ add_subdirectory(Vectorize) add_subdirectory(Hello) add_subdirectory(ObjCARC) add_subdirectory(Coroutines) +add_subdirectory(Tapir) diff --git a/llvm/lib/Transforms/IPO/LLVMBuild.txt b/llvm/lib/Transforms/IPO/LLVMBuild.txt index 54ce23876e66b4..e0d6b8353fc3a7 100644 --- a/llvm/lib/Transforms/IPO/LLVMBuild.txt +++ b/llvm/lib/Transforms/IPO/LLVMBuild.txt @@ -20,4 +20,4 @@ type = Library name = IPO parent = Transforms library_name = ipo -required_libraries = AggressiveInstCombine Analysis BitReader BitWriter Core InstCombine IRReader Linker Object ProfileData Scalar Support TransformUtils Vectorize Instrumentation +required_libraries = AggressiveInstCombine Analysis BitReader BitWriter Core InstCombine IRReader Linker Object ProfileData Scalar 
Support TapirOpts TransformUtils Vectorize Instrumentation diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index 9764944dc3329e..6f0c86f64fd304 100644 --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -41,6 +41,8 @@ #include "llvm/Transforms/Scalar/InstSimplifyPass.h" #include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h" #include "llvm/Transforms/Utils.h" +#include "llvm/Transforms/Tapir.h" +#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h" #include "llvm/Transforms/Vectorize.h" using namespace llvm; @@ -100,6 +102,10 @@ static cl::opt EnableUnrollAndJam("enable-unroll-and-jam", cl::init(false), cl::Hidden, cl::desc("Enable Unroll And Jam Pass")); +static cl::opt EnableLoopFuse( + "enable-loop-fuse", cl::init(false), cl::Hidden, + cl::desc("Enable the new, experimental LoopFusion Pass")); + static cl::opt EnablePrepareForThinLTO("prepare-for-thinlto", cl::init(false), cl::Hidden, cl::desc("Enable preparation for ThinLTO.")); @@ -161,8 +167,11 @@ static cl::opt cl::desc("Enable control height reduction optimization (CHR)")); PassManagerBuilder::PassManagerBuilder() { + InstrumentCilk = false; OptLevel = 2; SizeLevel = 0; + ParallelLevel = 0; + Rhino = false; LibraryInfo = nullptr; Inliner = nullptr; DisableUnrollLoops = false; @@ -423,6 +432,7 @@ void PassManagerBuilder::addFunctionSimplificationPasses( MPM.add(createControlHeightReductionLegacyPass()); } +// void PassManagerBuilder::prepopulateModulePassManager( void PassManagerBuilder::populateModulePassManager( legacy::PassManagerBase &MPM) { if (!PGOSampleUse.empty()) { @@ -442,6 +452,15 @@ void PassManagerBuilder::populateModulePassManager( Inliner = nullptr; } + if (ParallelLevel > 0) { + MPM.add(createInferFunctionAttrsLegacyPass()); + // MPM.add(createUnifyFunctionExitNodesPass()); + MPM.add(createLowerTapirToCilkPass(ParallelLevel == 2, InstrumentCilk)); + // The lowering pass may 
leave cruft around. Clean it up. + MPM.add(createCFGSimplificationPass()); + MPM.add(createInferFunctionAttrsLegacyPass()); + } + // FIXME: The BarrierNoopPass is a HACK! The inliner pass above implicitly // creates a CGSCC pass manager, but we don't want to add extensions into // that pass manager. To prevent this we insert a no-op module pass to reset @@ -498,6 +517,15 @@ void PassManagerBuilder::populateModulePassManager( if (PrepareForThinLTOUsingPGOSampleProfile) DisableUnrollLoops = true; + bool RerunAfterTapirLowering = false; + bool TapirHasBeenLowered = (ParallelLevel == 0); + if (ParallelLevel == 3) // -fdetach + MPM.add(createLowerTapirToCilkPass(false, InstrumentCilk)); + + do { + RerunAfterTapirLowering = + !TapirHasBeenLowered && (ParallelLevel > 0) && !PrepareForThinLTO; + // Infer attributes about declarations if possible. MPM.add(createInferFunctionAttrsLegacyPass()); @@ -745,6 +773,45 @@ void PassManagerBuilder::populateModulePassManager( // resulted in single-entry-single-exit or empty blocks. Clean up the CFG. MPM.add(createCFGSimplificationPass()); + if (RerunAfterTapirLowering || (ParallelLevel == 0)) + // Add passes to run just before Tapir lowering. + addExtensionsToPM(EP_TapirLate, MPM); + + if (!TapirHasBeenLowered) { + // First handle Tapir loops. + MPM.add(createIndVarSimplifyPass()); + + // Re-rotate loops in all our loop nests. These may have fallout out of + // rotated form due to GVN or other transformations, and loop spawning + // relies on the rotated form. Disable header duplication at -Oz. + MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1)); + + MPM.add(createLoopSpawningPass()); + + // The LoopSpawning pass may leave cruft around. Clean it up. + MPM.add(createLoopDeletionPass()); + MPM.add(createCFGSimplificationPass()); + addInstructionCombiningPass(MPM); + addExtensionsToPM(EP_Peephole, MPM); + + // Now lower Tapir to Cilk runtime calls. 
+ // + // TODO: Make this sequence of passes check the library info for the Cilk + // RTS. + + MPM.add(createInferFunctionAttrsLegacyPass()); + // MPM.add(createUnifyFunctionExitNodesPass()); + MPM.add(createLowerTapirToCilkPass(ParallelLevel == 2, InstrumentCilk)); + // The lowering pass may leave cruft around. Clean it up. + MPM.add(createCFGSimplificationPass()); + MPM.add(createInferFunctionAttrsLegacyPass()); + MPM.add(createMergeFunctionsPass()); + MPM.add(createBarrierNoopPass()); + + TapirHasBeenLowered = true; + } + } while (RerunAfterTapirLowering); + addExtensionsToPM(EP_OptimizerLast, MPM); if (PrepareForLTO) { @@ -754,6 +821,58 @@ void PassManagerBuilder::populateModulePassManager( } } +// void PassManagerBuilder::populateModulePassManager(legacy::PassManagerBase& MPM) { +// if (ParallelLevel != 0) { +// switch (ParallelLevel) { +// case 1: //fcilkplus +// case 2: //ftapir +// prepopulateModulePassManager(MPM); +// addExtensionsToPM(EP_TapirLate, MPM); +// break; +// case 3: //fdetach +// MPM.add(createLowerTapirToCilkPass(ParallelLevel == 2, InstrumentCilk)); +// prepopulateModulePassManager(MPM); +// addExtensionsToPM(EP_TapirLate, MPM); +// break; +// case 0: llvm_unreachable("invalid"); +// } + +// MPM.add(createBarrierNoopPass()); + +// if (OptLevel > 0) { +// MPM.add(createIndVarSimplifyPass()); + +// // Re-rotate loops in all our loop nests. These may have fallout out of +// // rotated form due to GVN or other transformations, and loop spawning +// // relies on the rotated form. Disable header duplication at -Oz. +// MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1)); + +// MPM.add(createLoopSpawningPass()); + +// // The LoopSpawning pass may leave cruft around. Clean it up. 
+// MPM.add(createLoopDeletionPass()); +// MPM.add(createCFGSimplificationPass()); +// addInstructionCombiningPass(MPM); +// addExtensionsToPM(EP_Peephole, MPM); +// } + +// // if (ParallelLevel != 3) MPM.add(createInferFunctionAttrsLegacyPass()); +// MPM.add(createInferFunctionAttrsLegacyPass()); +// MPM.add(createUnifyFunctionExitNodesPass()); +// MPM.add(createLowerTapirToCilkPass(ParallelLevel == 2, InstrumentCilk)); +// // The lowering pass may leave cruft around. Clean it up. +// MPM.add(createCFGSimplificationPass()); +// // if (ParallelLevel != 3) MPM.add(createInferFunctionAttrsLegacyPass()); +// MPM.add(createInferFunctionAttrsLegacyPass()); +// if (OptLevel != 0) MPM.add(createMergeFunctionsPass()); +// MPM.add(createBarrierNoopPass()); +// } +// prepopulateModulePassManager(MPM); +// if (ParallelLevel == 0) +// addExtensionsToPM(EP_TapirLate, MPM); +// addExtensionsToPM(EP_OptimizerLast, MPM); +// } + void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { // Load sample profile before running the LTO optimization pipeline. if (!PGOSampleUse.empty()) { diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index aeb25d530d71b3..3f15930c467c57 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -3908,6 +3908,15 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { Intrinsic::lifetime_end, *this)) return nullptr; break; + case Intrinsic::syncregion_start: { + int NumUsers = 0; + for (User *U : II->users()) + if (isa<DetachInst>(U) || isa<ReattachInst>(U) || isa<SyncInst>(U)) + ++NumUsers; + if (!NumUsers) + return eraseInstFromFunction(CI); + break; + } case Intrinsic::assume: { Value *IIOperand = II->getArgOperand(0); // Remove an assume if it is followed by an identical assume.
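The `syncregion_start` case added to InstCombine above is a simple dead-token cleanup: a sync-region token whose users include no detach, reattach, or sync instruction anchors nothing, so the call producing it can be erased. A minimal stand-alone sketch of that rule, in plain C++ rather than against the LLVM API (the `TokenUser` struct and `syncRegionIsDead` function are invented here for illustration):

```cpp
#include <string>
#include <vector>

// Hypothetical stand-in for an instruction that uses a sync-region token.
struct TokenUser { std::string Opcode; };

// Mirrors the InstCombine rule: a call to @llvm.syncregion.start is dead
// when none of its users is a detach, reattach, or sync instruction.
bool syncRegionIsDead(const std::vector<TokenUser> &Users) {
  int NumUsers = 0;
  for (const TokenUser &U : Users)
    if (U.Opcode == "detach" || U.Opcode == "reattach" || U.Opcode == "sync")
      ++NumUsers; // only these instruction kinds keep the token alive
  // No parallel users: the region start can be erased.
  return NumUsers == 0;
}
```

This matches the head's description of sync regions: only detach, reattach, and sync take the token as a parameter, so any other use (e.g. an ordinary call) does not make the region live.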
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 76ab614090faa8..c1fe6ff1c54ae3 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -1539,6 +1539,7 @@ bool InstCombiner::mergeStoreIntoSuccessor(StoreInst &SI) { if (StoreBB == DestBB || OtherBB == DestBB) return false; + assert(OtherBB); // Verify that the other block ends in a branch and is not otherwise empty. BasicBlock::iterator BBI(OtherBB->getTerminator()); BranchInst *OtherBr = dyn_cast<BranchInst>(BBI); diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index fef051aa1b7c35..421d4346c4593c 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -3085,6 +3085,11 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) { // We can only sink load instructions if there is nothing between the load and // the end of block that could change the value. if (I->mayReadFromMemory()) { + // We can't generally move an instruction that reads from memory past a + // detach or reattach. + if (isa<DetachInst>(I->getParent()->getTerminator()) || + isa<ReattachInst>(I->getParent()->getTerminator())) + return false; for (BasicBlock::iterator Scan = I->getIterator(), E = I->getParent()->end(); Scan != E; ++Scan) @@ -3185,8 +3190,10 @@ bool InstCombiner::run() { // If the user is one of our immediate successors, and if that successor // only has us as a predecessor (we'd have to split the critical edge - // otherwise), we can keep going. - if (UserIsSuccessor && UserParent->getUniquePredecessor()) { + // otherwise), we can keep going. Don't do this if the successor + // follows through a sync instruction, because that's a pessimization.
+ if (UserIsSuccessor && UserParent->getUniquePredecessor() && + !isa<SyncInst>(BB->getTerminator())) { // Okay, the CFG is simple enough, try to sink this instruction. if (TryToSinkInstruction(I, UserParent)) { LLVM_DEBUG(dbgs() << "IC: Sink: " << *I << '\n'); diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index f1558c75cb90bf..dd57e8a31e9587 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -1194,6 +1194,11 @@ bool AddressSanitizer::isInterestingAlloca(const AllocaInst &AI) { if (PreviouslySeenAllocaInfo != ProcessedAllocas.end()) return PreviouslySeenAllocaInfo->getSecond(); + bool FunctionContainsDetach = false; + { + for (const BasicBlock &BB : *(AI.getParent()->getParent())) + FunctionContainsDetach |= isa<DetachInst>(BB.getTerminator()); + } bool IsInteresting = (AI.getAllocatedType()->isSized() && // alloca() may be called with 0 size, ignore it. @@ -1201,6 +1206,8 @@ bool AddressSanitizer::isInterestingAlloca(const AllocaInst &AI) { // We are only interested in allocas not promotable to registers. // Promotable allocas are common under -O0. (!ClSkipPromotableAllocas || !isAllocaPromotable(&AI)) && + (!ClSkipPromotableAllocas || + (!FunctionContainsDetach || !isAllocaParallelPromotable(&AI, *DT))) && // inalloca allocas are not treated as static, and we don't want // dynamic alloca instrumentation for them as well.
!AI.isUsedWithInAlloca() && diff --git a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt index 94461849d5094e..0b41031ae18280 100644 --- a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt +++ b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt @@ -3,6 +3,7 @@ add_llvm_library(LLVMInstrumentation BoundsChecking.cpp CGProfile.cpp ControlHeightReduction.cpp + CilkSanitizer.cpp DataFlowSanitizer.cpp GCOVProfiling.cpp MemorySanitizer.cpp @@ -15,6 +16,7 @@ add_llvm_library(LLVMInstrumentation ThreadSanitizer.cpp EfficiencySanitizer.cpp HWAddressSanitizer.cpp + ComprehensiveStaticInstrumentation.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms diff --git a/llvm/lib/Transforms/Instrumentation/CilkSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/CilkSanitizer.cpp new file mode 100644 index 00000000000000..62b3e0b1ed5710 --- /dev/null +++ b/llvm/lib/Transforms/Instrumentation/CilkSanitizer.cpp @@ -0,0 +1,1164 @@ +//===- CilkSanitizer.cpp - determinacy race detector for Cilk/Tapir -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of CilkSan, a determinacy race detector for Cilk +// programs. +// +// This instrumentation pass inserts calls to the runtime library before +// appropriate memory accesses. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/DetachSSA.h" +#include "llvm/Analysis/MemorySSA.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/ProfileData/InstrProf.h" +#include "llvm/Transforms/CSI.h" +#include "llvm/Transforms/Instrumentation.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/EscapeEnumerator.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" +#include "llvm/Transforms/Utils/PromoteMemToReg.h" +#include "llvm/Transforms/Utils/TapirUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "cilksan" + +STATISTIC(NumInstrumentedReads, "Number of instrumented reads"); +STATISTIC(NumInstrumentedWrites, "Number of instrumented writes"); +STATISTIC(NumAccessesWithBadSize, "Number of accesses with bad size"); +STATISTIC(NumOmittedReadsBeforeWrite, + "Number of reads ignored due to following writes"); +STATISTIC(NumOmittedReadsFromConstants, + "Number of reads from constant data"); +STATISTIC(NumOmittedNonCaptured, "Number of accesses ignored due to capturing"); +STATISTIC(NumInstrumentedDetaches, "Number of instrumented detaches"); +STATISTIC(NumInstrumentedDetachExits, "Number of instrumented detach exits"); +STATISTIC(NumInstrumentedSyncs, "Number of instrumented syncs"); + +static const char *const CsanDetachBaseIdName = "__csan_unit_detach_base_id"; +static const char *const CsanTaskBaseIdName = "__csan_unit_task_base_id"; +static const char 
*const CsanTaskExitBaseIdName = + "__csan_unit_task_exit_base_id"; +static const char *const CsanDetachContinueBaseIdName = + "__csan_unit_detach_continue_base_id"; +static const char *const CsanSyncBaseIdName = "__csan_unit_sync_base_id"; +static const char *const CsiUnitObjTableName = "__csi_unit_obj_table"; +static const char *const CsiUnitObjTableArrayName = "__csi_unit_obj_tables"; + +/// Maintains a mapping from CSI ID of a load or store to the source information +/// of the object accessed by that load or store. +class ObjectTable : public ForensicTable { +public: + ObjectTable() : ForensicTable() {} + ObjectTable(Module &M, StringRef BaseIdName) + : ForensicTable(M, BaseIdName) {} + + /// The number of entries in this table + uint64_t size() const { return LocalIdToSourceLocationMap.size(); } + + /// Add the given instruction to this table. + /// \returns The local ID of the Instruction. + uint64_t add(Instruction &I, Value *Addr, const DataLayout &DL); + + /// Get the Type for a pointer to a table entry. + /// + /// A table entry is just a source location. + static PointerType *getPointerType(LLVMContext &C); + + /// Insert this table into the given Module. + /// + /// The table is constructed as a ConstantArray indexed by local IDs. The + /// runtime is responsible for performing the mapping that allows the table to + /// be indexed by global ID. + Constant *insertIntoModule(Module &M) const; + +private: + struct SourceLocation { + StringRef Name; + int32_t Line; + StringRef Filename; + StringRef Directory; + }; + + /// Map of local ID to SourceLocation. + DenseMap LocalIdToSourceLocationMap; + + /// Create a struct type to match the "struct SourceLocation" type. + /// (and the source_loc_t type in csi.h). + static StructType *getSourceLocStructType(LLVMContext &C); + + /// Append the line and file information to the table. 
+ void add(uint64_t ID, int32_t Line = -1, + StringRef Filename = "", StringRef Directory = "", + StringRef Name = ""); +}; + +namespace { + +struct CilkSanitizerImpl : public CSIImpl { + // CilkSanitizerImpl(Module &M, CallGraph *CG, + // function_ref GetDSSA, + // function_ref GetMSSA) + // : CSIImpl(M, CG), GetDSSA(GetDSSA), GetMSSA(GetMSSA) { + CilkSanitizerImpl(Module &M, CallGraph *CG, + function_ref GetDomTree, + const TargetLibraryInfo *TLI) + : CSIImpl(M, CG), GetDomTree(GetDomTree), TLI(TLI), + CsanFuncEntry(nullptr), CsanFuncExit(nullptr), CsanRead(nullptr), + CsanWrite(nullptr), CsanDetach(nullptr), CsanDetachContinue(nullptr), + CsanTaskEntry(nullptr), CsanTaskExit(nullptr), CsanSync(nullptr) { + // Even though we're doing our own instrumentation, we want the CSI setup + // for the instrumentation of function entry/exit, memory accesses (i.e., + // loads and stores), atomics, memory intrinsics. We also want call sites, + // for extracting debug information. + Options.InstrumentBasicBlocks = false; + // Options.InstrumentCalls = false; + Options.InstrumentMemoryAccesses = false; + Options.InstrumentMemIntrinsics = false; + } + bool run(); + + static StructType *getUnitObjTableType(LLVMContext &C, + PointerType *EntryPointerType); + static Constant *objTableToUnitObjTable(Module &M, + StructType *UnitObjTableType, + ObjectTable &ObjTable); + + // Methods for handling FED tables + void initializeCsanFEDTables(); + void collectUnitFEDTables(); + + // Methods for handling object tables + void initializeCsanObjectTables(); + void collectUnitObjectTables(); + + CallInst *createRTUnitInitCall(IRBuilder<> &IRB) override; + + // Initialize custom hooks for CilkSanitizer + void initializeCsanHooks(); + + // Insert hooks at relevant program points + bool instrumentLoadOrStore(Instruction *I, const DataLayout &DL); + bool instrumentAtomic(Instruction *I, const DataLayout &DL); + bool instrumentMemIntrinsic(Instruction *I, const DataLayout &DL); + bool 
instrumentCallsite(Instruction *I, DominatorTree *DT); + bool instrumentDetach(DetachInst *DI, DominatorTree *DT); + bool instrumentSync(SyncInst *SI); + bool instrumentFunction(Function &F); + void chooseInstructionsToInstrument( + SmallVectorImpl &Local, + SmallVectorImpl &All, + const DataLayout &DL); + +private: + // Analysis results + // function_ref GetDSSA; + // function_ref GetMSSA; + function_ref GetDomTree; + const TargetLibraryInfo *TLI; + + // Instrumentation hooks + Function *CsanFuncEntry, *CsanFuncExit; + Function *CsanRead, *CsanWrite; + Function *CsanLargeRead, *CsanLargeWrite; + Function *CsanDetach, *CsanDetachContinue; + Function *CsanTaskEntry, *CsanTaskExit; + Function *CsanSync; + + // CilkSanitizer FED tables + FrontEndDataTable DetachFED, TaskFED, TaskExitFED, DetachContinueFED, + SyncFED; + + // CilkSanitizer custom forensic tables + ObjectTable LoadObj, StoreObj; + + SmallVector UnitObjTables; + +}; + +/// CilkSanitizer: instrument the code in module to find races. +struct CilkSanitizer : public ModulePass { + static char ID; // Pass identification, replacement for typeid. 
+ CilkSanitizer() : ModulePass(ID) { + initializeCilkSanitizerPass(*PassRegistry::getPassRegistry()); + } + StringRef getPassName() const override { + return "CilkSanitizer"; + } + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnModule(Module &M); +}; +} // namespace + +char CilkSanitizer::ID = 0; + +INITIALIZE_PASS_BEGIN( + CilkSanitizer, "csan", + "CilkSanitizer: detects determinacy races in Cilk programs.", + false, false) +INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +// INITIALIZE_PASS_DEPENDENCY(DetachSSAWrapperPass) +// INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) +INITIALIZE_PASS_END( + CilkSanitizer, "csan", + "CilkSanitizer: detects determinacy races in Cilk programs.", + false, false) + +void CilkSanitizer::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<CallGraphWrapperPass>(); + AU.addRequired<DominatorTreeWrapperPass>(); + AU.addRequired<TargetLibraryInfoWrapperPass>(); + // AU.addRequired<DetachSSAWrapperPass>(); + // AU.addRequired<MemorySSAWrapperPass>(); +} + +ModulePass *llvm::createCilkSanitizerPass() { + return new CilkSanitizer(); +} + +uint64_t ObjectTable::add(Instruction &I, + Value *Addr, + const DataLayout &DL) { + uint64_t ID = getId(&I); + Value *Obj = GetUnderlyingObject(Addr, DL); + + // First, if the underlying object is a global variable, get that variable's + // debug information. + if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Obj)) { + SmallVector DbgGVExprs; + GV->getDebugInfo(DbgGVExprs); + for (auto *GVE : DbgGVExprs) { + auto *DGV = GVE->getVariable(); + if (DGV->getName() != "") { + add(ID, DGV->getLine(), DGV->getFilename(), DGV->getDirectory(), + DGV->getName()); + return ID; + } + } + add(ID); + return ID; + } + + // Next, if this is an alloca instruction, look for a llvm.dbg.declare + // intrinsic.
+ if (isa(Obj)) { + if (auto *DDI = FindAllocaDbgDeclare(Obj)) { + auto *LV = DDI->getVariable(); + if (LV->getName() != "") { + add(ID, LV->getLine(), LV->getFilename(), LV->getDirectory(), + LV->getName()); + return ID; + } + } + } + + // Otherwise just examine the llvm.dbg.value intrinsics for this object. + SmallVector DbgValues; + findDbgValues(DbgValues, Obj); + for (auto *DVI : DbgValues) { + auto *LV = DVI->getVariable(); + if (LV->getName() != "") { + add(ID, LV->getLine(), LV->getFilename(), LV->getDirectory(), + LV->getName()); + return ID; + } + } + + add(ID); + return ID; +} + +PointerType *ObjectTable::getPointerType(LLVMContext &C) { + return PointerType::get(getSourceLocStructType(C), 0); +} + +StructType *ObjectTable::getSourceLocStructType(LLVMContext &C) { + return StructType::get( + /* Name */ PointerType::get(IntegerType::get(C, 8), 0), + /* Line */ IntegerType::get(C, 32), + /* File */ PointerType::get(IntegerType::get(C, 8), 0)); +} + +void ObjectTable::add(uint64_t ID, int32_t Line, + StringRef Filename, StringRef Directory, + StringRef Name) { + assert(LocalIdToSourceLocationMap.find(ID) == + LocalIdToSourceLocationMap.end() && + "Id already exists in FED table."); + LocalIdToSourceLocationMap[ID] = {Name, Line, Filename, Directory}; +} + +Constant *ObjectTable::insertIntoModule(Module &M) const { + LLVMContext &C = M.getContext(); + StructType *TableType = getSourceLocStructType(C); + IntegerType *Int32Ty = IntegerType::get(C, 32); + Constant *Zero = ConstantInt::get(Int32Ty, 0); + Value *GepArgs[] = {Zero, Zero}; + SmallVector TableEntries; + + for (uint64_t LocalID = 0; LocalID < IdCounter; ++LocalID) { + const SourceLocation &E = LocalIdToSourceLocationMap.find(LocalID)->second; + Constant *Line = ConstantInt::get(Int32Ty, E.Line); + Constant *File; + { + std::string Filename = E.Filename.str(); + if (!E.Directory.empty()) + Filename = E.Directory.str() + "/" + Filename; + Constant *FileStrConstant = ConstantDataArray::getString(C, 
Filename); + GlobalVariable *GV = + M.getGlobalVariable("__csi_unit_filename_" + Filename, true); + if (GV == NULL) { + GV = new GlobalVariable(M, FileStrConstant->getType(), + true, GlobalValue::PrivateLinkage, + FileStrConstant, + "__csi_unit_filename_" + Filename, + nullptr, + GlobalVariable::NotThreadLocal, 0); + GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + } + assert(GV); + File = + ConstantExpr::getGetElementPtr(GV->getValueType(), GV, GepArgs); + } + Constant *Name; + if (E.Name.empty()) + Name = ConstantPointerNull::get(PointerType::get( + IntegerType::get(C, 8), 0)); + else { + Constant *NameStrConstant = ConstantDataArray::getString(C, E.Name); + GlobalVariable *GV = + M.getGlobalVariable(("__csi_unit_object_name_" + E.Name).str(), true); + if (GV == NULL) { + GV = new GlobalVariable(M, NameStrConstant->getType(), + true, GlobalValue::PrivateLinkage, + NameStrConstant, + "__csi_unit_object_name_" + E.Name, + nullptr, + GlobalVariable::NotThreadLocal, 0); + GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + } + assert(GV); + Name = + ConstantExpr::getGetElementPtr(GV->getValueType(), GV, GepArgs); + } + // The order of arguments to ConstantStruct::get() must match the + // source_loc_t type in csi.h. 
+ TableEntries.push_back(ConstantStruct::get(TableType, Name, Line, File)); + } + + ArrayType *TableArrayType = ArrayType::get(TableType, TableEntries.size()); + Constant *Table = ConstantArray::get(TableArrayType, TableEntries); + GlobalVariable *GV = + new GlobalVariable(M, TableArrayType, false, GlobalValue::InternalLinkage, + Table, CsiUnitObjTableName); + return ConstantExpr::getGetElementPtr(GV->getValueType(), GV, GepArgs); +} + +bool CilkSanitizerImpl::run() { + initializeCsi(); + initializeCsanFEDTables(); + initializeCsanObjectTables(); + initializeCsanHooks(); + + for (Function &F : M) { + DEBUG(dbgs() << "Instrumenting " << F.getName() << "\n"); + instrumentFunction(F); + } + + collectUnitFEDTables(); + collectUnitObjectTables(); + finalizeCsi(); + return true; +} + +void CilkSanitizerImpl::initializeCsanFEDTables() { + DetachFED = FrontEndDataTable(M, CsanDetachBaseIdName); + TaskFED = FrontEndDataTable(M, CsanTaskBaseIdName); + TaskExitFED = FrontEndDataTable(M, CsanTaskExitBaseIdName); + DetachContinueFED = FrontEndDataTable(M, CsanDetachContinueBaseIdName); + SyncFED = FrontEndDataTable(M, CsanSyncBaseIdName); +} + +void CilkSanitizerImpl::initializeCsanObjectTables() { + LoadObj = ObjectTable(M, CsiLoadBaseIdName); + StoreObj = ObjectTable(M, CsiStoreBaseIdName); +} + +void CilkSanitizerImpl::collectUnitFEDTables() { + CSIImpl::collectUnitFEDTables(); + LLVMContext &C = M.getContext(); + StructType *UnitFedTableType = + getUnitFedTableType(C, FrontEndDataTable::getPointerType(C)); + + // The order of the FED tables here must match the enum in csanrt.c and the + // csan_instrumentation_counts_t in csan.h. 
+ UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, DetachFED)); + UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, TaskFED)); + UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, TaskExitFED)); + UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, DetachContinueFED)); + UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, SyncFED)); +} + +// Create a struct type to match the unit_obj_entry_t type in csanrt.c. +StructType *CilkSanitizerImpl::getUnitObjTableType(LLVMContext &C, + PointerType *EntryPointerType) { + return StructType::get(IntegerType::get(C, 64), + EntryPointerType); +} + +Constant *CilkSanitizerImpl::objTableToUnitObjTable( + Module &M, StructType *UnitObjTableType, ObjectTable &ObjTable) { + Constant *NumEntries = + ConstantInt::get(IntegerType::get(M.getContext(), 64), ObjTable.size()); + // Constant *BaseIdPtr = + // ConstantExpr::getPointerCast(FedTable.baseId(), + // Type::getInt8PtrTy(M.getContext(), 0)); + Constant *InsertedTable = ObjTable.insertIntoModule(M); + return ConstantStruct::get(UnitObjTableType, NumEntries, + InsertedTable); +} + +void CilkSanitizerImpl::collectUnitObjectTables() { + LLVMContext &C = M.getContext(); + StructType *UnitObjTableType = + getUnitObjTableType(C, ObjectTable::getPointerType(C)); + + UnitObjTables.push_back( + objTableToUnitObjTable(M, UnitObjTableType, LoadObj)); + UnitObjTables.push_back( + objTableToUnitObjTable(M, UnitObjTableType, StoreObj)); +} + +CallInst *CilkSanitizerImpl::createRTUnitInitCall(IRBuilder<> &IRB) { + LLVMContext &C = M.getContext(); + + StructType *UnitFedTableType = + getUnitFedTableType(C, FrontEndDataTable::getPointerType(C)); + StructType *UnitObjTableType = + getUnitObjTableType(C, ObjectTable::getPointerType(C)); + + // Lookup __csirt_unit_init + SmallVector InitArgTypes({IRB.getInt8PtrTy(), + PointerType::get(UnitFedTableType, 0), + PointerType::get(UnitObjTableType, 0), + 
InitCallsiteToFunction->getType()}); + FunctionType *InitFunctionTy = + FunctionType::get(IRB.getVoidTy(), InitArgTypes, false); + RTUnitInit = checkCsiInterfaceFunction( + M.getOrInsertFunction(CsiRtUnitInitName, InitFunctionTy)); + assert(RTUnitInit); + + ArrayType *UnitFedTableArrayType = + ArrayType::get(UnitFedTableType, UnitFedTables.size()); + Constant *FEDTable = ConstantArray::get(UnitFedTableArrayType, UnitFedTables); + GlobalVariable *FEDGV = new GlobalVariable(M, UnitFedTableArrayType, false, + GlobalValue::InternalLinkage, FEDTable, + CsiUnitFedTableArrayName); + + ArrayType *UnitObjTableArrayType = + ArrayType::get(UnitObjTableType, UnitObjTables.size()); + Constant *ObjTable = ConstantArray::get(UnitObjTableArrayType, UnitObjTables); + GlobalVariable *ObjGV = new GlobalVariable(M, UnitObjTableArrayType, false, + GlobalValue::InternalLinkage, ObjTable, + CsiUnitObjTableArrayName); + + Constant *Zero = ConstantInt::get(IRB.getInt32Ty(), 0); + Value *GepArgs[] = {Zero, Zero}; + + // Insert call to __csirt_unit_init + return IRB.CreateCall( + RTUnitInit, + {IRB.CreateGlobalStringPtr(M.getName()), + ConstantExpr::getGetElementPtr(FEDGV->getValueType(), FEDGV, GepArgs), + ConstantExpr::getGetElementPtr(ObjGV->getValueType(), ObjGV, GepArgs), + InitCallsiteToFunction}); +} + +void CilkSanitizerImpl::initializeCsanHooks() { + LLVMContext &C = M.getContext(); + IRBuilder<> IRB(C); + Type *FuncPropertyTy = CsiFuncProperty::getType(C); + Type *FuncExitPropertyTy = CsiFuncExitProperty::getType(C); + Type *LoadPropertyTy = CsiLoadStoreProperty::getType(C); + Type *StorePropertyTy = CsiLoadStoreProperty::getType(C); + Type *RetType = IRB.getVoidTy(); + Type *AddrType = IRB.getInt8PtrTy(); + Type *NumBytesType = IRB.getInt32Ty(); + Type *LargeNumBytesType = IntptrTy; + Type *IDType = IRB.getInt64Ty(); + + CsanFuncEntry = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csan_func_entry", RetType, + /* func_id */ IDType, + /* stack_ptr */ AddrType, + 
FuncPropertyTy)); + CsanFuncExit = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csan_func_exit", RetType, + /* func_exit_id */ IDType, + /* func_id */ IDType, + FuncExitPropertyTy)); + + CsanRead = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csan_load", RetType, IDType, + AddrType, NumBytesType, LoadPropertyTy)); + CsanWrite = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csan_store", RetType, IDType, + AddrType, NumBytesType, StorePropertyTy)); + CsanLargeRead = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csan_large_load", RetType, IDType, + AddrType, LargeNumBytesType, LoadPropertyTy)); + CsanLargeWrite = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csan_large_store", RetType, IDType, + AddrType, LargeNumBytesType, StorePropertyTy)); + // CsanWrite = checkCsiInterfaceFunction( + // M.getOrInsertFunction("__csan_atomic_exchange", RetType, IDType, + // AddrType, NumBytesType, StorePropertyTy)); + + CsanDetach = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csan_detach", RetType, + /* detach_id */ IDType)); + CsanTaskEntry = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csan_task", RetType, + /* task_id */ IDType, + /* detach_id */ IDType, + /* stack_ptr */ AddrType)); + CsanTaskExit = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csan_task_exit", RetType, + /* task_exit_id */ IDType, + /* task_id */ IDType, + /* detach_id */ IDType)); + CsanDetachContinue = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csan_detach_continue", RetType, + /* detach_continue_id */ IDType, + /* detach_id */ IDType)); + CsanSync = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csan_sync", RetType, IDType)); +} + +// Do not instrument known races/"benign races" that come from compiler +// instrumentation. The user has no way of suppressing them. +static bool shouldInstrumentReadWriteFromAddress(const Module *M, Value *Addr) { + // Peel off GEPs and BitCasts.
+  Addr = Addr->stripInBoundsOffsets();
+
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) {
+    if (GV->hasSection()) {
+      StringRef SectionName = GV->getSection();
+      // Check if the global is in the PGO counters section.
+      auto OF = Triple(M->getTargetTriple()).getObjectFormat();
+      if (SectionName.endswith(
+              getInstrProfSectionName(IPSK_cnts, OF, /*AddSegmentInfo=*/false)))
+        return false;
+    }
+
+    // Check if the global is private gcov data.
+    if (GV->getName().startswith("__llvm_gcov") ||
+        GV->getName().startswith("__llvm_gcda"))
+      return false;
+  }
+
+  // Do not instrument accesses from different address spaces; we cannot deal
+  // with them.
+  if (Addr) {
+    Type *PtrTy = cast<PointerType>(Addr->getType()->getScalarType());
+    if (PtrTy->getPointerAddressSpace() != 0)
+      return false;
+  }
+
+  return true;
+}
+
+// Examine the uses of a given AllocaInst to determine if some use is detached.
+static bool MightHaveDetachedUse(const AllocaInst *AI) {
+  const BasicBlock *AllocaCtx = GetDetachedCtx(AI->getParent());
+  SmallVector<const Use *, 8> Worklist;
+  SmallSet<const Use *, 8> Visited;
+
+  for (const Use &U : AI->uses()) {
+    Visited.insert(&U);
+    Worklist.push_back(&U);
+  }
+
+  while (!Worklist.empty()) {
+    const Use *U = Worklist.pop_back_val();
+    Instruction *I = cast<Instruction>(U->getUser());
+    if (AllocaCtx != GetDetachedCtx(I->getParent()))
+      return true;
+
+    switch (I->getOpcode()) {
+    case Instruction::BitCast:
+    case Instruction::GetElementPtr:
+    case Instruction::PHI:
+    case Instruction::Select:
+    case Instruction::AddrSpaceCast:
+      for (Use &UU : I->uses())
+        if (Visited.insert(&UU).second)
+          Worklist.push_back(&UU);
+      break;
+    default:
+      break;
+    }
+  }
+  return false;
+}
+
+void CilkSanitizerImpl::chooseInstructionsToInstrument(
+    SmallVectorImpl<Instruction *> &Local, SmallVectorImpl<Instruction *> &All,
+    const DataLayout &DL) {
+  SmallSet<Value *, 8> WriteTargets;
+  // Iterate from the end.
+  for (Instruction *I : reverse(Local)) {
+    if (StoreInst *Store = dyn_cast<StoreInst>(I)) {
+      Value *Addr = Store->getPointerOperand();
+      if (!shouldInstrumentReadWriteFromAddress(I->getModule(), Addr))
+        continue;
+      WriteTargets.insert(Addr);
+    } else {
+      LoadInst *Load = cast<LoadInst>(I);
+      Value *Addr = Load->getPointerOperand();
+      if (!shouldInstrumentReadWriteFromAddress(I->getModule(), Addr))
+        continue;
+      if (WriteTargets.count(Addr)) {
+        // We will write to this temp, so no reason to analyze the read.
+        NumOmittedReadsBeforeWrite++;
+        continue;
+      }
+      if (addrPointsToConstantData(Addr)) {
+        // Addr points to some constant data -- it cannot race with any writes.
+        NumOmittedReadsFromConstants++;
+        continue;
+      }
+    }
+    Value *Addr = isa<StoreInst>(*I)
+        ? cast<StoreInst>(I)->getPointerOperand()
+        : cast<LoadInst>(I)->getPointerOperand();
+    Value *Obj = GetUnderlyingObject(Addr, DL);
+    if (isa<AllocaInst>(Obj) &&
+        !PointerMayBeCaptured(Addr, true, true) &&
+        !MightHaveDetachedUse(cast<AllocaInst>(Obj))) {
+      // The variable is addressable but not captured, so it cannot be
+      // referenced from a different thread and participate in a data race
+      // (see llvm/Analysis/CaptureTracking.h for details).
+      NumOmittedNonCaptured++;
+      continue;
+    }
+    All.push_back(I);
+  }
+  Local.clear();
+}
+
+bool CilkSanitizerImpl::instrumentFunction(Function &F) {
+  if (F.empty() || shouldNotInstrumentFunction(F))
+    return false;
+
+  DominatorTree *DT = &GetDomTree(F);
+  // DetachSSA &DSSA = GetDSSA(F);
+  // MemorySSA &MSSA = GetMSSA(F);
+
+  SmallVector<Instruction *, 8> AllLoadsAndStores;
+  SmallVector<Instruction *, 8> LocalLoadsAndStores;
+  SmallVector<Instruction *, 8> AtomicAccesses;
+  SmallVector<Instruction *, 8> MemIntrinCalls;
+  SmallVector<Instruction *, 8> Callsites;
+  SmallVector<DetachInst *, 8> Detaches;
+  SmallVector<SyncInst *, 8> Syncs;
+  bool Res = false;
+  bool HasCalls = false;
+  bool MaySpawn = false;
+
+  // TODO: Consider modifying this to choose instrumentation to insert based on
+  // fibrils, not basic blocks.
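As an aside, the elision logic in chooseInstructionsToInstrument can be sketched without LLVM types: scan a straight-line run of accesses in reverse, record store targets, and drop any read that precedes a write to the same address in the same run. The struct, the string-keyed address model, and the function name below are illustrative only, not the pass's actual types.

```cpp
#include <cassert>
#include <string>
#include <unordered_set>
#include <vector>

struct Access { bool IsWrite; std::string Addr; };

// Reverse scan, mirroring the pass's read-before-write elision: a read is
// skipped when a later write in the same run targets the same address,
// since the racy access to report is the write.
std::vector<Access> chooseToInstrument(const std::vector<Access> &Local) {
  std::unordered_set<std::string> WriteTargets;
  std::vector<Access> All;
  for (auto It = Local.rbegin(); It != Local.rend(); ++It) {
    if (It->IsWrite)
      WriteTargets.insert(It->Addr);
    else if (WriteTargets.count(It->Addr))
      continue; // read followed by a write to the same location: elide
    All.push_back(*It);
  }
  return All;
}
```

Note the result comes out in reverse program order, which is harmless here because every surviving access is instrumented regardless of order.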
+ for (BasicBlock &BB : F) { + // Record the Tapir instructions found + if (DetachInst *DI = dyn_cast(BB.getTerminator())) { + MaySpawn = true; + Detaches.push_back(DI); + } else if (SyncInst *SI = dyn_cast(BB.getTerminator())) + Syncs.push_back(SI); + + // Record the memory accesses in the basic block + for (Instruction &Inst : BB) { + if (isa(Inst) || isa(Inst)) + LocalLoadsAndStores.push_back(&Inst); + else if (isa(Inst) || isa(Inst)) + AtomicAccesses.push_back(&Inst); + else if (isa(Inst) || isa(Inst)) { + if (CallInst *CI = dyn_cast(&Inst)) + maybeMarkSanitizerLibraryCallNoBuiltin(CI, TLI); + if (isa(Inst)) + MemIntrinCalls.push_back(&Inst); + if (!isa(Inst)) { + if (!isa(Inst)) + Callsites.push_back(&Inst); + HasCalls = true; + chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores, + DL); + } + } + } + chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores, DL); + } + + uint64_t LocalId = getLocalFunctionID(F); + + for (auto Inst : AllLoadsAndStores) + Res |= instrumentLoadOrStore(Inst, DL); + + for (auto Inst : AtomicAccesses) + Res |= instrumentAtomic(Inst, DL); + + for (auto Inst : MemIntrinCalls) + Res |= instrumentMemIntrinsic(Inst, DL); + + for (auto Inst : Callsites) + Res |= instrumentCallsite(Inst, DT); + + for (auto Inst : Detaches) + Res |= instrumentDetach(Inst, DT); + + for (auto Inst : Syncs) + Res |= instrumentSync(Inst); + + if ((Res || HasCalls)) { + IRBuilder<> IRB(&*F.getEntryBlock().getFirstInsertionPt()); + CsiFuncProperty FuncEntryProp; + FuncEntryProp.setMaySpawn(MaySpawn); + Value *FuncId = FunctionFED.localToGlobalId(LocalId, IRB); + // TODO: Determine if we actually want the frame pointer, not the stack + // pointer. 
+  // Value *StackSave = IRB.CreateCall(
+  //     Intrinsic::getDeclaration(&M, Intrinsic::stacksave));
+  // IRB.CreateCall(CsanFuncEntry, {FuncId, StackSave, FuncEntryProp.getValue(IRB)});
+  Value *FrameAddr = IRB.CreateCall(
+      Intrinsic::getDeclaration(&M, Intrinsic::frameaddress),
+      {IRB.getInt32(0)});
+  IRB.CreateCall(CsanFuncEntry, {FuncId, FrameAddr, FuncEntryProp.getValue(IRB)});
+
+  EscapeEnumerator EE(F, "csan_cleanup", true);
+  while (IRBuilder<> *AtExit = EE.Next()) {
+    // uint64_t ExitLocalId = FunctionExitFED.add(F);
+    uint64_t ExitLocalId = FunctionExitFED.add(*AtExit->GetInsertPoint());
+    Value *ExitCsiId = FunctionExitFED.localToGlobalId(ExitLocalId, *AtExit);
+    CsiFuncExitProperty FuncExitProp;
+    FuncExitProp.setMaySpawn(MaySpawn);
+    AtExit->CreateCall(CsanFuncExit,
+                       {ExitCsiId, FuncId, FuncExitProp.getValue(*AtExit)});
+    }
+  }
+  return Res;
+}
+
+bool CilkSanitizerImpl::instrumentLoadOrStore(Instruction *I,
+                                              const DataLayout &DL) {
+  IRBuilder<> IRB(I);
+  bool IsWrite = isa<StoreInst>(*I);
+  Value *Addr = IsWrite
+      ? cast<StoreInst>(I)->getPointerOperand()
+      : cast<LoadInst>(I)->getPointerOperand();
+
+  // swifterror memory addresses are mem2reg promoted by instruction selection.
+  // As such they cannot have regular uses like an instrumentation function and
+  // it makes no sense to track them as memory.
+  if (Addr->isSwiftError())
+    return false;
+
+  int NumBytesAccessed = getNumBytesAccessed(Addr, DL);
+  if (-1 == NumBytesAccessed) {
+    // Ignore accesses with bad sizes.
+    NumAccessesWithBadSize++;
+    return false;
+  }
+
+  const unsigned Alignment = IsWrite
+      ? cast<StoreInst>(I)->getAlignment()
+      : cast<LoadInst>(I)->getAlignment();
+  CsiLoadStoreProperty Prop;
+  Prop.setAlignment(Alignment);
+  if (IsWrite) {
+    uint64_t LocalId = StoreFED.add(*I);
+    uint64_t StoreObjId = StoreObj.add(*I, Addr, DL);
+    assert(LocalId == StoreObjId &&
+           "Store received different IDs in FED and object tables.");
+    Value *CsiId = StoreFED.localToGlobalId(LocalId, IRB);
+    Value *Args[] = {CsiId,
+                     IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()),
+                     IRB.getInt32(NumBytesAccessed),
+                     Prop.getValue(IRB)};
+    Instruction *Call = IRB.CreateCall(CsanWrite, Args);
+    IRB.SetInstDebugLocation(Call);
+    NumInstrumentedWrites++;
+  } else {
+    uint64_t LocalId = LoadFED.add(*I);
+    uint64_t LoadObjId = LoadObj.add(*I, Addr, DL);
+    assert(LocalId == LoadObjId &&
+           "Load received different IDs in FED and object tables.");
+    Value *CsiId = LoadFED.localToGlobalId(LocalId, IRB);
+    Value *Args[] = {CsiId,
+                     IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()),
+                     IRB.getInt32(NumBytesAccessed),
+                     Prop.getValue(IRB)};
+    Instruction *Call = IRB.CreateCall(CsanRead, Args);
+    IRB.SetInstDebugLocation(Call);
+    NumInstrumentedReads++;
+  }
+  return true;
+}
+
+bool CilkSanitizerImpl::instrumentAtomic(Instruction *I, const DataLayout &DL) {
+  IRBuilder<> IRB(I);
+  CsiLoadStoreProperty Prop;
+  Value *Addr;
+  if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(I)) {
+    Addr = RMWI->getPointerOperand();
+  } else if (AtomicCmpXchgInst *CASI = dyn_cast<AtomicCmpXchgInst>(I)) {
+    Addr = CASI->getPointerOperand();
+  } else {
+    return false;
+  }
+
+  Value *Obj = GetUnderlyingObject(Addr, DL);
+  if (isa<AllocaInst>(Obj) &&
+      !PointerMayBeCaptured(Addr, true, true) &&
+      !MightHaveDetachedUse(cast<AllocaInst>(Obj))) {
+    // The variable is addressable but not captured, so it cannot be
+    // referenced from a different thread and participate in a data race
+    // (see llvm/Analysis/CaptureTracking.h for details).
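The suppression test that keeps recurring here (an alloca whose address is never captured and never used in a detached context cannot race) hinges on MightHaveDetachedUse: a worklist walk over the alloca's transitive uses, following only pointer-transparent instructions. A toy stand-in, with detached contexts reduced to integers and all names invented for the sketch:

```cpp
#include <cassert>
#include <queue>
#include <unordered_set>
#include <vector>

// Toy model of MightHaveDetachedUse. Each value lives in a task context
// (index 0 is the alloca itself); "transparent" ops (bitcast, GEP, phi,
// select, addrspacecast) forward the address to their own users. The
// alloca might have a detached use if any transitively reachable user
// sits in a different context than the alloca.
struct Node {
  int Ctx;                // detached-task context of this user
  bool Transparent;       // does it forward the pointer onward?
  std::vector<int> Users; // indices of users of this node's result
};

bool mightHaveDetachedUse(const std::vector<Node> &G, int AllocaIdx) {
  int AllocaCtx = G[AllocaIdx].Ctx;
  std::unordered_set<int> Visited{AllocaIdx};
  std::queue<int> Worklist;
  for (int U : G[AllocaIdx].Users)
    Worklist.push(U);
  while (!Worklist.empty()) {
    int I = Worklist.front();
    Worklist.pop();
    if (!Visited.insert(I).second)
      continue; // already explored
    if (G[I].Ctx != AllocaCtx)
      return true; // use escaped into a detached task
    if (G[I].Transparent)
      for (int U : G[I].Users)
        Worklist.push(U);
  }
  return false;
}
```

The real pass walks `Use` edges and compares `GetDetachedCtx` of basic blocks; the graph encoding here only illustrates the traversal shape.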
+ NumOmittedNonCaptured++; + return false; + } + + int NumBytesAccessed = getNumBytesAccessed(Addr, DL); + if (-1 == NumBytesAccessed) { + // Ignore accesses with bad sizes. + NumAccessesWithBadSize++; + return false; + } + + uint64_t LocalId = StoreFED.add(*I); + uint64_t StoreObjId = StoreObj.add(*I, Addr, DL); + assert(LocalId == StoreObjId && + "Store received different ID's in FED and object tables."); + Value *CsiId = StoreFED.localToGlobalId(LocalId, IRB); + Value *Args[] = {CsiId, + IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()), + IRB.getInt32(NumBytesAccessed), + Prop.getValue(IRB)}; + Instruction *Call = IRB.CreateCall(CsanWrite, Args); + IRB.SetInstDebugLocation(Call); + NumInstrumentedWrites++; + return true; +} + +bool CilkSanitizerImpl::instrumentMemIntrinsic(Instruction *I, + const DataLayout &DL) { + CsiLoadStoreProperty Prop; + IRBuilder<> IRB(I); + if (MemSetInst *M = dyn_cast(I)) { + // Check if we need to instrument the memset. + Value *Addr = M->getArgOperand(0); + Value *Obj = GetUnderlyingObject(Addr, DL); + if (isa(Obj) && + !PointerMayBeCaptured(Addr, true, true) && + !MightHaveDetachedUse(cast(Obj))) { + // The variable is addressable but not captured, so it cannot be + // referenced from a different thread and participate in a data race + // (see llvm/Analysis/CaptureTracking.h for details). 
+ NumOmittedNonCaptured++; + return false; + } + + if (ConstantInt *CI = dyn_cast(M->getArgOperand(3))) + Prop.setAlignment(CI->getZExtValue()); + uint64_t LocalId = StoreFED.add(*I); + uint64_t StoreObjId = StoreObj.add(*I, Addr, DL); + assert(LocalId == StoreObjId && + "Store received different ID's in FED and object tables."); + Value *CsiId = StoreFED.localToGlobalId(LocalId, IRB); + Value *Args[] = {CsiId, + IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()), + IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false), + Prop.getValue(IRB)}; + Instruction *Call = IRB.CreateCall(CsanLargeWrite, Args); + IRB.SetInstDebugLocation(Call); + return true; + + } else if (MemTransferInst *M = dyn_cast(I)) { + if (ConstantInt *CI = dyn_cast(M->getArgOperand(3))) + Prop.setAlignment(CI->getZExtValue()); + Value *StoreAddr = M->getArgOperand(0); + Value *LoadAddr = M->getArgOperand(1); + bool Instrumented = false; + + // First check if we need to instrument the store. + Value *SObj = GetUnderlyingObject(StoreAddr, DL); + if (isa(SObj) && + !PointerMayBeCaptured(StoreAddr, true, true) && + !MightHaveDetachedUse(cast(SObj))) { + // The variable is addressable but not captured, so it cannot be + // referenced from a different thread and participate in a data race + // (see llvm/Analysis/CaptureTracking.h for details). 
+ NumOmittedNonCaptured++; + } else { + // Instrument the store + uint64_t StoreId = StoreFED.add(*I); + uint64_t StoreObjId = StoreObj.add(*I, StoreAddr, DL); + assert(StoreId == StoreObjId && + "Store received different ID's in FED and object tables."); + Value *StoreCsiId = StoreFED.localToGlobalId(StoreId, IRB); + Value *StoreArgs[] = {StoreCsiId, + IRB.CreatePointerCast(StoreAddr, IRB.getInt8PtrTy()), + IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false), + Prop.getValue(IRB)}; + Instruction *WriteCall = IRB.CreateCall(CsanLargeWrite, StoreArgs); + IRB.SetInstDebugLocation(WriteCall); + Instrumented = true; + } + Value *LObj = GetUnderlyingObject(LoadAddr, DL); + if (isa(LObj) && + !PointerMayBeCaptured(LoadAddr, true, true) && + !MightHaveDetachedUse(cast(LObj))) { + // The variable is addressable but not captured, so it cannot be + // referenced from a different thread and participate in a data race + // (see llvm/Analysis/CaptureTracking.h for details). + NumOmittedNonCaptured++; + } else { + // Instrument the load + uint64_t LoadId = LoadFED.add(*I); + uint64_t LoadObjId = LoadObj.add(*I, LoadAddr, DL); + assert(LoadId == LoadObjId && + "Load received different ID's in FED and object tables."); + Value *LoadCsiId = StoreFED.localToGlobalId(LoadId, IRB); + Value *LoadArgs[] = {LoadCsiId, + IRB.CreatePointerCast(LoadAddr, IRB.getInt8PtrTy()), + IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false), + Prop.getValue(IRB)}; + Instruction *ReadCall = IRB.CreateCall(CsanLargeRead, LoadArgs); + IRB.SetInstDebugLocation(ReadCall); + Instrumented = true; + } + return Instrumented; + } + return false; +} + +bool CilkSanitizerImpl::instrumentCallsite(Instruction *I, DominatorTree *DT) { + // Exclude calls to the syncregion.start intrinsic. 
+  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
+    if (Intrinsic::syncregion_start == II->getIntrinsicID() ||
+        Intrinsic::lifetime_start == II->getIntrinsicID() ||
+        Intrinsic::lifetime_end == II->getIntrinsicID())
+      return false;
+
+  bool IsInvoke = isa<InvokeInst>(I);
+
+  Function *Called = NULL;
+  if (CallInst *CI = dyn_cast<CallInst>(I))
+    Called = CI->getCalledFunction();
+  else if (InvokeInst *II = dyn_cast<InvokeInst>(I))
+    Called = II->getCalledFunction();
+
+  IRBuilder<> IRB(I);
+  uint64_t LocalId = CallsiteFED.add(*I);
+  Value *CallsiteId = CallsiteFED.localToGlobalId(LocalId, IRB);
+  Value *FuncId = NULL;
+  GlobalVariable *FuncIdGV = NULL;
+  if (Called) {
+    Module *M = I->getParent()->getParent()->getParent();
+    std::string GVName =
+        CsiFuncIdVariablePrefix + Called->getName().str();
+    FuncIdGV = dyn_cast<GlobalVariable>(M->getOrInsertGlobal(GVName,
+                                                             IRB.getInt64Ty()));
+    assert(FuncIdGV);
+    FuncIdGV->setConstant(false);
+    FuncIdGV->setLinkage(GlobalValue::WeakAnyLinkage);
+    FuncIdGV->setInitializer(IRB.getInt64(CsiCallsiteUnknownTargetId));
+    FuncId = IRB.CreateLoad(FuncIdGV);
+  } else {
+    // Unknown targets (i.e. indirect calls) are always unknown.
+    FuncId = IRB.getInt64(CsiCallsiteUnknownTargetId);
+  }
+  assert(FuncId != NULL);
+  CsiCallProperty Prop;
+  Prop.setIsIndirect(!Called);
+  Value *PropVal = Prop.getValue(IRB);
+  insertConditionalHookCall(I, CsiBeforeCallsite,
+                            {CallsiteId, FuncId, PropVal});
+
+  BasicBlock::iterator Iter(I);
+  if (IsInvoke) {
+    // There are two "after" positions for invokes: the normal block
+    // and the exception block. This also means we have to recompute
+    // the callsite and function IDs in each basic block so that we
+    // can use it for the after hook.
+
+    // TODO: Do we want the "after" hook for this callsite to come
+    // before or after the BB entry hook? Currently it is inserted
+    // before BB entry because instrumentCallsite is called after
+    // instrumentBasicBlock.
+
+    // TODO: If a destination of an invoke has multiple predecessors, then we
+    // must split that destination.
+    InvokeInst *II = dyn_cast<InvokeInst>(I);
+    BasicBlock *NormalBB = II->getNormalDest();
+    unsigned SuccNum = GetSuccessorNumber(II->getParent(), NormalBB);
+    if (isCriticalEdge(II, SuccNum))
+      NormalBB = SplitCriticalEdge(II, SuccNum,
+                                   CriticalEdgeSplittingOptions(DT));
+    IRB.SetInsertPoint(&*NormalBB->getFirstInsertionPt());
+    CallsiteId = CallsiteFED.localToGlobalId(LocalId, IRB);
+    if (FuncIdGV != NULL) FuncId = IRB.CreateLoad(FuncIdGV);
+    PropVal = Prop.getValue(IRB);
+    insertConditionalHookCall(&*IRB.GetInsertPoint(), CsiAfterCallsite,
+                              {CallsiteId, FuncId, PropVal});
+
+    BasicBlock *UnwindBB = II->getUnwindDest();
+    IRB.SetInsertPoint(&*UnwindBB->getFirstInsertionPt());
+    CallsiteId = CallsiteFED.localToGlobalId(LocalId, IRB);
+    if (FuncIdGV != NULL) FuncId = IRB.CreateLoad(FuncIdGV);
+    PropVal = Prop.getValue(IRB);
+    insertConditionalHookCall(&*IRB.GetInsertPoint(), CsiAfterCallsite,
+                              {CallsiteId, FuncId, PropVal});
+  } else {
+    // Simple call instruction; there is only one "after" position.
+    Iter++;
+    IRB.SetInsertPoint(&*Iter);
+    PropVal = Prop.getValue(IRB);
+    insertConditionalHookCall(&*Iter, CsiAfterCallsite,
+                              {CallsiteId, FuncId, PropVal});
+  }
+
+  return true;
+}
+
+bool CilkSanitizerImpl::instrumentDetach(DetachInst *DI,
+                                         DominatorTree *DT) {
+  // Instrument the detach instruction itself
+  Value *DetachID;
+  {
+    IRBuilder<> IRB(DI);
+    uint64_t LocalID = DetachFED.add(*DI);
+    DetachID = DetachFED.localToGlobalId(LocalID, IRB);
+    Instruction *Call = IRB.CreateCall(CsanDetach, {DetachID});
+    IRB.SetInstDebugLocation(Call);
+  }
+  NumInstrumentedDetaches++;
+
+  // Find the detached block, continuation, and associated reattaches.
+  BasicBlock *DetachedBlock = DI->getDetached();
+  BasicBlock *ContinueBlock = DI->getContinue();
+  SmallVector<BasicBlock *, 8> TaskExits;
+  // TODO: Extend this loop to find EH exits of the detached task.
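instrumentDetach locates the normal exits of the detached task as the reattach-terminated predecessors of the continuation block. A minimal model of that query over an adjacency-list CFG; the enum and field names are invented for the sketch, and real code would also handle the exception exits flagged in the TODO above.

```cpp
#include <cassert>
#include <vector>

enum class Term { Branch, Detach, Reattach, Sync };

struct Block { Term Terminator; std::vector<int> Succs; };

// Collect the predecessors of the continuation block whose terminator is
// a reattach -- exactly the normal exits of the detached task, since a
// reattach is the only Tapir terminator that returns control from the
// detached sub-CFG to the continuation.
std::vector<int> taskExits(const std::vector<Block> &CFG, int ContinueBlock) {
  std::vector<int> Exits;
  for (int B = 0; B < (int)CFG.size(); ++B)
    for (int S : CFG[B].Succs)
      if (S == ContinueBlock && CFG[B].Terminator == Term::Reattach)
        Exits.push_back(B);
  return Exits;
}
```

In this model the detach block branches to both the detached body and the continuation, so filtering predecessors by terminator kind is what excludes the detach edge itself.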
+  for (BasicBlock *Pred : predecessors(ContinueBlock))
+    if (isa<ReattachInst>(Pred->getTerminator()))
+      TaskExits.push_back(Pred);
+
+  // Instrument the entry and exit points of the detached task.
+  {
+    // Instrument the entry point of the detached task.
+    IRBuilder<> IRB(&*DetachedBlock->getFirstInsertionPt());
+    uint64_t LocalID = TaskFED.add(*DetachedBlock);
+    Value *TaskID = TaskFED.localToGlobalId(LocalID, IRB);
+    // TODO: Determine if we actually want the frame pointer, not the stack
+    // pointer.
+    // Value *StackSave = IRB.CreateCall(
+    //     Intrinsic::getDeclaration(&M, Intrinsic::stacksave));
+    // Instruction *Call = IRB.CreateCall(CsanTaskEntry,
+    //                                    {TaskID, DetachID, StackSave});
+    Value *FrameAddr = IRB.CreateCall(
+        Intrinsic::getDeclaration(&M, Intrinsic::frameaddress),
+        {IRB.getInt32(0)});
+    Instruction *Call = IRB.CreateCall(CsanTaskEntry,
+                                       {TaskID, DetachID, FrameAddr});
+    IRB.SetInstDebugLocation(Call);
+
+    // Instrument the exit points of the detached tasks.
+    for (BasicBlock *TaskExit : TaskExits) {
+      IRBuilder<> IRB(TaskExit->getTerminator());
+      uint64_t LocalID = TaskExitFED.add(*TaskExit->getTerminator());
+      Value *TaskExitID = TaskExitFED.localToGlobalId(LocalID, IRB);
+      Instruction *Call = IRB.CreateCall(CsanTaskExit,
+                                         {TaskExitID, TaskID, DetachID});
+      IRB.SetInstDebugLocation(Call);
+      NumInstrumentedDetachExits++;
+    }
+  }
+
+  // Instrument the continuation of the detach.
+ { + if (isCriticalContinueEdge(DI, 1)) + ContinueBlock = SplitCriticalEdge( + DI, 1, + CriticalEdgeSplittingOptions(DT).setSplitDetachContinue()); + + IRBuilder<> IRB(&*ContinueBlock->getFirstInsertionPt()); + uint64_t LocalID = DetachContinueFED.add(*ContinueBlock); + Value *ContinueID = DetachContinueFED.localToGlobalId(LocalID, IRB); + Instruction *Call = IRB.CreateCall(CsanDetachContinue, + {ContinueID, DetachID}); + IRB.SetInstDebugLocation(Call); + } + return true; +} + +bool CilkSanitizerImpl::instrumentSync(SyncInst *SI) { + IRBuilder<> IRB(SI); + // Get the ID of this sync. + uint64_t LocalID = SyncFED.add(*SI); + Value *SyncID = SyncFED.localToGlobalId(LocalID, IRB); + // Insert instrumentation before the sync. + Instruction *Call = IRB.CreateCall(CsanSync, {SyncID}); + IRB.SetInstDebugLocation(Call); + NumInstrumentedSyncs++; + return true; +} + +bool CilkSanitizer::runOnModule(Module &M) { + if (skipModule(M)) + return false; + + // auto GetDSSA = [this](Function &F) -> DetachSSA & { + // return this->getAnalysis(F).getDSSA(); + // }; + // auto GetMSSA = [this](Function &F) -> MemorySSA & { + // return this->getAnalysis(F).getMSSA(); + // }; + + CallGraph *CG = &getAnalysis().getCallGraph(); + const TargetLibraryInfo *TLI = + &getAnalysis().getTLI(); + auto GetDomTree = [this](Function &F) -> DominatorTree & { + return this->getAnalysis(F).getDomTree(); + }; + + // return CilkSanitizerImpl(M, CG, GetDSSA, GetMSSA).run(); + return CilkSanitizerImpl(M, CG, GetDomTree, TLI).run(); +} diff --git a/llvm/lib/Transforms/Instrumentation/ComprehensiveStaticInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/ComprehensiveStaticInstrumentation.cpp new file mode 100644 index 00000000000000..1446eb4b8e7dd3 --- /dev/null +++ b/llvm/lib/Transforms/Instrumentation/ComprehensiveStaticInstrumentation.cpp @@ -0,0 +1,982 @@ +//===-- ComprehensiveStaticInstrumentation.cpp - instrumentation hooks ----===// +// +// The LLVM Compiler Infrastructure +// +// TODO: 
License
+//===----------------------------------------------------------------------===//
+//
+// This file is part of CSI, a framework that provides comprehensive static
+// instrumentation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/CSI.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "csi"
+
+static cl::opt<bool> ClInstrumentFuncEntryExit(
+    "csi-instrument-func-entry-exit", cl::init(true),
+    cl::desc("Instrument function entry and exit"), cl::Hidden);
+static cl::opt<bool> ClInstrumentBasicBlocks(
+    "csi-instrument-basic-blocks", cl::init(true),
+    cl::desc("Instrument basic blocks"), cl::Hidden);
+static cl::opt<bool> ClInstrumentMemoryAccesses(
+    "csi-instrument-memory-accesses", cl::init(true),
+    cl::desc("Instrument memory accesses"), cl::Hidden);
+static cl::opt<bool> ClInstrumentCalls(
+    "csi-instrument-function-calls", cl::init(true),
+    cl::desc("Instrument function calls"), cl::Hidden);
+static cl::opt<bool> ClInstrumentAtomics(
+    "csi-instrument-atomics", cl::init(true),
+    cl::desc("Instrument atomics"), cl::Hidden);
+static cl::opt<bool> ClInstrumentMemIntrinsics(
+    "csi-instrument-memintrinsics", cl::init(true),
+    cl::desc("Instrument memintrinsics (memset/memcpy/memmove)"), cl::Hidden);
+
+namespace {
+
+static CSIOptions OverrideFromCL(CSIOptions Options) {
+  Options.InstrumentFuncEntryExit |= ClInstrumentFuncEntryExit;
+  Options.InstrumentBasicBlocks |= ClInstrumentBasicBlocks;
+
Options.InstrumentMemoryAccesses |= ClInstrumentMemoryAccesses; + Options.InstrumentCalls |= ClInstrumentCalls; + Options.InstrumentAtomics |= ClInstrumentAtomics; + Options.InstrumentMemIntrinsics |= ClInstrumentMemIntrinsics; + return Options; +} + +/// The Comprehensive Static Instrumentation pass. +/// Inserts calls to user-defined hooks at predefined points in the IR. +struct ComprehensiveStaticInstrumentation : public ModulePass { + static char ID; // Pass identification, replacement for typeid. + + ComprehensiveStaticInstrumentation( + const CSIOptions &Options = CSIOptions()) + : ModulePass(ID), Options(OverrideFromCL(Options)) { + initializeComprehensiveStaticInstrumentationPass( + *PassRegistry::getPassRegistry()); + } + StringRef getPassName() const override { + return "ComprehensiveStaticInstrumentation"; + } + bool runOnModule(Module &M) override; + void getAnalysisUsage(AnalysisUsage &AU) const override; + +private: + CSIOptions Options; +}; // struct ComprehensiveStaticInstrumentation +} // anonymous namespace + +char ComprehensiveStaticInstrumentation::ID = 0; + +INITIALIZE_PASS(ComprehensiveStaticInstrumentation, "csi", + "ComprehensiveStaticInstrumentation pass", false, false) + +ModulePass *llvm::createComprehensiveStaticInstrumentationPass( + const CSIOptions &Options) { + return new ComprehensiveStaticInstrumentation(Options); +} + +/// Return the first DILocation in the given basic block, or nullptr +/// if none exists. +static const DILocation *getFirstDebugLoc(const BasicBlock &BB) { + for (const Instruction &Inst : BB) + if (const DILocation *Loc = Inst.getDebugLoc()) + return Loc; + + return nullptr; +} + +/// Set DebugLoc on the call instruction to a CSI hook, based on the +/// debug information of the instrumented instruction. 
+static void setInstrumentationDebugLoc(Instruction *Instrumented, + Instruction *Call) { + DISubprogram *Subprog = Instrumented->getFunction()->getSubprogram(); + if (Subprog) { + if (Instrumented->getDebugLoc()) { + Call->setDebugLoc(Instrumented->getDebugLoc()); + } else { + LLVMContext &C = Instrumented->getFunction()->getParent()->getContext(); + Call->setDebugLoc(DILocation::get(C, 0, 0, Subprog)); + } + } +} + +/// Set DebugLoc on the call instruction to a CSI hook, based on the +/// debug information of the instrumented instruction. +static void setInstrumentationDebugLoc(BasicBlock &Instrumented, + Instruction *Call) { + DISubprogram *Subprog = Instrumented.getParent()->getSubprogram(); + if (Subprog) { + if (const DILocation *FirstDebugLoc = getFirstDebugLoc(Instrumented)) + Call->setDebugLoc(FirstDebugLoc); + else { + LLVMContext &C = Instrumented.getParent()->getParent()->getContext(); + Call->setDebugLoc(DILocation::get(C, 0, 0, Subprog)); + } + } +} + +/// Set DebugLoc on the call instruction to a CSI hook, based on the +/// debug information of the instrumented instruction. +static void setInstrumentationDebugLoc(Function &Instrumented, + Instruction *Call) { + DISubprogram *Subprog = Instrumented.getSubprogram(); + if (Subprog) { + LLVMContext &C = Instrumented.getParent()->getContext(); + Call->setDebugLoc(DILocation::get(C, 0, 0, Subprog)); + } +} + +bool CSIImpl::run() { + initializeCsi(); + + for (Function &F : M) + instrumentFunction(F); + + collectUnitFEDTables(); + finalizeCsi(); + return true; // We always insert the unit constructor. 
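The ForensicTable machinery that follows hands out dense local IDs per compilation unit and relies on a runtime-patched base value to globalize them; localToGlobalId is just base plus offset. A sketch of that scheme outside LLVM IR, where the runtime's patching of the base global is modeled as a plain integer field (class and member names mirror the pass but the encoding is illustrative):

```cpp
#include <cassert>
#include <cstdint>
#include <map>

// Per-unit ID table: objects get dense local IDs in discovery order, and
// a unit-wide base (filled in by the runtime before any hook fires) maps
// a local ID to a globally unique one.
class ForensicTable {
  std::map<const void *, uint64_t> LocalIds;
  uint64_t IdCounter = 0;

public:
  uint64_t Base = 0; // runtime patches this at startup

  // Assign (or look up) the local ID for a value.
  uint64_t getId(const void *V) {
    auto It = LocalIds.find(V);
    if (It == LocalIds.end())
      It = LocalIds.emplace(V, IdCounter++).first;
    return It->second;
  }

  uint64_t localToGlobalId(uint64_t LocalId) const { return Base + LocalId; }
};
```

In the real pass the base lives in an internal-linkage i64 global loaded with an `invariant_load` annotation, so the add can be folded once the runtime has initialized it.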
+} + +ForensicTable::ForensicTable(Module &M, StringRef BaseIdName) { + LLVMContext &C = M.getContext(); + IntegerType *Int64Ty = IntegerType::get(C, 64); + IdCounter = 0; + BaseId = new GlobalVariable(M, Int64Ty, false, GlobalValue::InternalLinkage, + ConstantInt::get(Int64Ty, 0), BaseIdName); + assert(BaseId); +} + +uint64_t ForensicTable::getId(const Value *V) { + if (!ValueToLocalIdMap.count(V)) + ValueToLocalIdMap[V] = IdCounter++; + assert(ValueToLocalIdMap.count(V) && "Value not in ID map."); + return ValueToLocalIdMap[V]; +} + +Value *ForensicTable::localToGlobalId(uint64_t LocalId, + IRBuilder<> &IRB) const { + assert(BaseId); + LLVMContext &C = IRB.getContext(); + LoadInst *Base = IRB.CreateLoad(BaseId); + MDNode *MD = llvm::MDNode::get(C, None); + Base->setMetadata(LLVMContext::MD_invariant_load, MD); + Value *Offset = IRB.getInt64(LocalId); + return IRB.CreateAdd(Base, Offset); +} + +uint64_t FrontEndDataTable::add(const Function &F) { + uint64_t ID = getId(&F); + add(ID, F.getSubprogram()); + return ID; +} + +uint64_t FrontEndDataTable::add(const BasicBlock &BB) { + uint64_t ID = getId(&BB); + add(ID, getFirstDebugLoc(BB)); + return ID; +} + +uint64_t FrontEndDataTable::add(const Instruction &I) { + uint64_t ID = getId(&I); + add(ID, I.getDebugLoc()); + return ID; +} + +PointerType *FrontEndDataTable::getPointerType(LLVMContext &C) { + return PointerType::get(getSourceLocStructType(C), 0); +} + +StructType *FrontEndDataTable::getSourceLocStructType(LLVMContext &C) { + return StructType::get( + /* Name */ PointerType::get(IntegerType::get(C, 8), 0), + /* Line */ IntegerType::get(C, 32), + /* Column */ IntegerType::get(C, 32), + /* File */ PointerType::get(IntegerType::get(C, 8), 0)); +} + +void FrontEndDataTable::add(uint64_t ID, const DILocation *Loc) { + if (Loc) { + // TODO: Add location information for inlining + const DISubprogram *Subprog = Loc->getScope()->getSubprogram(); + add(ID, (int32_t)Loc->getLine(), (int32_t)Loc->getColumn(), + 
Loc->getFilename(), Loc->getDirectory(), Subprog->getName()); + } else + add(ID); +} + +void FrontEndDataTable::add(uint64_t ID, const DISubprogram *Subprog) { + if (Subprog) + add(ID, (int32_t)Subprog->getLine(), -1, Subprog->getFilename(), + Subprog->getDirectory(), Subprog->getName()); + else + add(ID); +} + +void FrontEndDataTable::add(uint64_t ID, int32_t Line, int32_t Column, + StringRef Filename, StringRef Directory, + StringRef Name) { + assert(LocalIdToSourceLocationMap.find(ID) == + LocalIdToSourceLocationMap.end() && + "Id already exists in FED table."); + LocalIdToSourceLocationMap[ID] = {Name, Line, Column, Filename, Directory}; +} + +Constant *FrontEndDataTable::insertIntoModule(Module &M) const { + LLVMContext &C = M.getContext(); + StructType *FedType = getSourceLocStructType(C); + IntegerType *Int32Ty = IntegerType::get(C, 32); + Constant *Zero = ConstantInt::get(Int32Ty, 0); + Value *GepArgs[] = {Zero, Zero}; + SmallVector FEDEntries; + + for (uint64_t LocalID = 0; LocalID < IdCounter; ++LocalID) { + const SourceLocation &E = LocalIdToSourceLocationMap.find(LocalID)->second; + Constant *Line = ConstantInt::get(Int32Ty, E.Line); + Constant *Column = ConstantInt::get(Int32Ty, E.Column); + Constant *File; + { + std::string Filename = E.Filename.str(); + if (!E.Directory.empty()) + Filename = E.Directory.str() + "/" + Filename; + Constant *FileStrConstant = ConstantDataArray::getString(C, Filename); + GlobalVariable *GV = + M.getGlobalVariable("__csi_unit_filename_" + Filename, true); + if (GV == NULL) { + GV = new GlobalVariable(M, FileStrConstant->getType(), + true, GlobalValue::PrivateLinkage, + FileStrConstant, + "__csi_unit_filename_" + Filename, + nullptr, + GlobalVariable::NotThreadLocal, 0); + GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + } + assert(GV); + File = + ConstantExpr::getGetElementPtr(GV->getValueType(), GV, GepArgs); + } + Constant *Name; + if (E.Name.empty()) + Name = ConstantPointerNull::get(PointerType::get( + 
IntegerType::get(C, 8), 0)); + else { + Constant *NameStrConstant = ConstantDataArray::getString(C, E.Name); + GlobalVariable *GV = + M.getGlobalVariable(("__csi_unit_function_name_" + E.Name).str(), true); + if (GV == NULL) { + GV = new GlobalVariable(M, NameStrConstant->getType(), + true, GlobalValue::PrivateLinkage, + NameStrConstant, + "__csi_unit_function_name_" + E.Name, + nullptr, + GlobalVariable::NotThreadLocal, 0); + GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + } + assert(GV); + Name = + ConstantExpr::getGetElementPtr(GV->getValueType(), GV, GepArgs); + } + // The order of arguments to ConstantStruct::get() must match the + // source_loc_t type in csi.h. + FEDEntries.push_back(ConstantStruct::get(FedType, Name, Line, Column, + File)); + } + + ArrayType *FedArrayType = ArrayType::get(FedType, FEDEntries.size()); + Constant *Table = ConstantArray::get(FedArrayType, FEDEntries); + GlobalVariable *GV = + new GlobalVariable(M, FedArrayType, false, GlobalValue::InternalLinkage, + Table, CsiUnitFedTableName); + return ConstantExpr::getGetElementPtr(GV->getValueType(), GV, GepArgs); +} + +void CSIImpl::initializeFuncHooks() { + LLVMContext &C = M.getContext(); + IRBuilder<> IRB(C); + Type *FuncPropertyTy = CsiFuncProperty::getType(C); + CsiFuncEntry = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csi_func_entry", IRB.getVoidTy(), + IRB.getInt64Ty(), FuncPropertyTy)); + Type *FuncExitPropertyTy = CsiFuncExitProperty::getType(C); + CsiFuncExit = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csi_func_exit", IRB.getVoidTy(), + IRB.getInt64Ty(), IRB.getInt64Ty(), + FuncExitPropertyTy)); +} + +void CSIImpl::initializeBasicBlockHooks() { + LLVMContext &C = M.getContext(); + IRBuilder<> IRB(C); + Type *PropertyTy = CsiBBProperty::getType(C); + CsiBBEntry = checkCsiInterfaceFunction(M.getOrInsertFunction( + "__csi_bb_entry", IRB.getVoidTy(), IRB.getInt64Ty(), PropertyTy)); + CsiBBExit = checkCsiInterfaceFunction(M.getOrInsertFunction( + 
"__csi_bb_exit", IRB.getVoidTy(), IRB.getInt64Ty(), PropertyTy)); +} + +void CSIImpl::initializeCallsiteHooks() { + LLVMContext &C = M.getContext(); + IRBuilder<> IRB(C); + Type *PropertyTy = CsiCallProperty::getType(C); + CsiBeforeCallsite = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csi_before_call", IRB.getVoidTy(), + IRB.getInt64Ty(), IRB.getInt64Ty(), PropertyTy)); + CsiAfterCallsite = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csi_after_call", IRB.getVoidTy(), + IRB.getInt64Ty(), IRB.getInt64Ty(), PropertyTy)); +} + +void CSIImpl::initializeLoadStoreHooks() { + LLVMContext &C = M.getContext(); + IRBuilder<> IRB(C); + Type *LoadPropertyTy = CsiLoadStoreProperty::getType(C); + Type *StorePropertyTy = CsiLoadStoreProperty::getType(C); + Type *RetType = IRB.getVoidTy(); + Type *AddrType = IRB.getInt8PtrTy(); + Type *NumBytesType = IRB.getInt32Ty(); + + CsiBeforeRead = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csi_before_load", RetType, IRB.getInt64Ty(), + AddrType, NumBytesType, LoadPropertyTy)); + CsiAfterRead = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csi_after_load", RetType, IRB.getInt64Ty(), + AddrType, NumBytesType, LoadPropertyTy)); + + CsiBeforeWrite = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csi_before_store", RetType, IRB.getInt64Ty(), + AddrType, NumBytesType, StorePropertyTy)); + CsiAfterWrite = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csi_after_store", RetType, IRB.getInt64Ty(), + AddrType, NumBytesType, StorePropertyTy)); +} + +void CSIImpl::initializeMemIntrinsicsHooks() { + LLVMContext &C = M.getContext(); + IRBuilder<> IRB(C); + + MemmoveFn = checkCsiInterfaceFunction( + M.getOrInsertFunction("memmove", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), + IRB.getInt8PtrTy(), IntptrTy)); + MemcpyFn = checkCsiInterfaceFunction( + M.getOrInsertFunction("memcpy", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), + IRB.getInt8PtrTy(), IntptrTy)); + MemsetFn = checkCsiInterfaceFunction( + 
M.getOrInsertFunction("memset", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+                            IRB.getInt32Ty(), IntptrTy));
+}
+
+int CSIImpl::getNumBytesAccessed(Value *Addr, const DataLayout &DL) {
+  Type *OrigPtrTy = Addr->getType();
+  Type *OrigTy = cast<PointerType>(OrigPtrTy)->getElementType();
+  assert(OrigTy->isSized());
+  uint32_t TypeSize = DL.getTypeStoreSizeInBits(OrigTy);
+  if (TypeSize % 8 != 0) {
+    // if (TypeSize != 8 && TypeSize != 16 && TypeSize != 32 && TypeSize != 64 &&
+    //     TypeSize != 128 && TypeSize != 256 && TypeSize != 512) {
+    return -1;
+  }
+  return TypeSize / 8;
+}
+
+void CSIImpl::addLoadStoreInstrumentation(
+    Instruction *I, Function *BeforeFn, Function *AfterFn, Value *CsiId,
+    Type *AddrType, Value *Addr, int NumBytes, CsiLoadStoreProperty &Prop) {
+  IRBuilder<> IRB(I);
+  Value *PropVal = Prop.getValue(IRB);
+  insertConditionalHookCall(I, BeforeFn,
+                            {CsiId, IRB.CreatePointerCast(Addr, AddrType),
+                             IRB.getInt32(NumBytes), PropVal});
+
+  BasicBlock::iterator Iter(I);
+  Iter++;
+  IRB.SetInsertPoint(&*Iter);
+  insertConditionalHookCall(&*Iter, AfterFn,
+                            {CsiId, IRB.CreatePointerCast(Addr, AddrType),
+                             IRB.getInt32(NumBytes), PropVal});
+}
+
+void CSIImpl::instrumentLoadOrStore(Instruction *I, CsiLoadStoreProperty &Prop,
+                                    const DataLayout &DL) {
+  IRBuilder<> IRB(I);
+  bool IsWrite = isa<StoreInst>(I);
+  Value *Addr = IsWrite ? cast<StoreInst>(I)->getPointerOperand()
+                        : cast<LoadInst>(I)->getPointerOperand();
+  int NumBytes = getNumBytesAccessed(Addr, DL);
+  Type *AddrType = IRB.getInt8PtrTy();
+
+  if (NumBytes == -1)
+    return; // size that we don't recognize
+
+  if (IsWrite) {
+    uint64_t LocalId = StoreFED.add(*I);
+    Value *CsiId = StoreFED.localToGlobalId(LocalId, IRB);
+    addLoadStoreInstrumentation(I, CsiBeforeWrite, CsiAfterWrite, CsiId,
+                                AddrType, Addr, NumBytes, Prop);
+  } else { // is read
+    uint64_t LocalId = LoadFED.add(*I);
+    Value *CsiId = LoadFED.localToGlobalId(LocalId, IRB);
+    addLoadStoreInstrumentation(I, CsiBeforeRead, CsiAfterRead, CsiId, AddrType,
+                                Addr, NumBytes, Prop);
+  }
+}
+
+void CSIImpl::instrumentAtomic(Instruction *I, const DataLayout &DL) {
+  // For now, print a message that this code contains atomics.
+  dbgs() << "WARNING: Uninstrumented atomic operations in program-under-test!\n";
+}
+
+// If a memset intrinsic gets inlined by the code gen, we will miss races on it.
+// So, we either need to ensure the intrinsic is not inlined, or instrument it.
+// We do not instrument memset/memmove/memcpy intrinsics (too complicated),
+// instead we simply replace them with regular function calls, which are then
+// intercepted by the run-time.
+// Since our pass runs after everyone else, the calls should not be
+// replaced back with intrinsics. If that becomes wrong at some point,
+// we will need to call e.g. __csi_memset to avoid the intrinsics.
+bool CSIImpl::instrumentMemIntrinsic(Instruction *I) {
+  IRBuilder<> IRB(I);
+  if (MemSetInst *M = dyn_cast<MemSetInst>(I)) {
+    Instruction *Call = IRB.CreateCall(
+        MemsetFn,
+        {IRB.CreatePointerCast(M->getArgOperand(0), IRB.getInt8PtrTy()),
+         IRB.CreateIntCast(M->getArgOperand(1), IRB.getInt32Ty(), false),
+         IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false)});
+    setInstrumentationDebugLoc(I, Call);
+    I->eraseFromParent();
+    return true;
+  } else if (MemTransferInst *M = dyn_cast<MemTransferInst>(I)) {
+    Instruction *Call = IRB.CreateCall(
+        isa<MemCpyInst>(M) ? MemcpyFn : MemmoveFn,
+        {IRB.CreatePointerCast(M->getArgOperand(0), IRB.getInt8PtrTy()),
+         IRB.CreatePointerCast(M->getArgOperand(1), IRB.getInt8PtrTy()),
+         IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false)});
+    setInstrumentationDebugLoc(I, Call);
+    I->eraseFromParent();
+    return true;
+  }
+  return false;
+}
+
+void CSIImpl::instrumentBasicBlock(BasicBlock &BB) {
+  IRBuilder<> IRB(&*BB.getFirstInsertionPt());
+  //LLVMContext &C = IRB.getContext();
+  uint64_t LocalId = BasicBlockFED.add(BB);
+  Value *CsiId = BasicBlockFED.localToGlobalId(LocalId, IRB);
+  CsiBBProperty Prop;
+  TerminatorInst *TI = BB.getTerminator();
+  Value *PropVal = Prop.getValue(IRB);
+  insertConditionalHookCall(&*IRB.GetInsertPoint(), CsiBBEntry,
+                            {CsiId, PropVal});
+  insertConditionalHookCall(TI, CsiBBExit,
+                            {CsiId, PropVal});
+}
+
+void CSIImpl::instrumentCallsite(Instruction *I) {
+  // Ignore calls to debug intrinsics
+  if (isa<DbgInfoIntrinsic>(I))
+    return;
+
+  bool IsInvoke = false;
+  Function *Called = NULL;
+  if (CallInst *CI = dyn_cast<CallInst>(I)) {
+    Called = CI->getCalledFunction();
+  } else if (InvokeInst *II = dyn_cast<InvokeInst>(I)) {
+    Called = II->getCalledFunction();
+    IsInvoke = true;
+  }
+
+  // if (Called && Called->getName().startswith("llvm.dbg")) {
+  //   return;
+  // }
+
+  IRBuilder<> IRB(I);
+  uint64_t LocalId = CallsiteFED.add(*I);
+  Value *CallsiteId = CallsiteFED.localToGlobalId(LocalId, IRB);
+  Value *FuncId = NULL;
+  GlobalVariable *FuncIdGV = NULL;
+  if (Called) {
+    Module *M = I->getParent()->getParent()->getParent();
+    std::string GVName =
+        CsiFuncIdVariablePrefix + Called->getName().str();
+    FuncIdGV = dyn_cast<GlobalVariable>(
+        M->getOrInsertGlobal(GVName, IRB.getInt64Ty()));
+    assert(FuncIdGV);
+    FuncIdGV->setConstant(false);
+    FuncIdGV->setLinkage(GlobalValue::WeakAnyLinkage);
+    FuncIdGV->setInitializer(IRB.getInt64(CsiCallsiteUnknownTargetId));
+    FuncId = IRB.CreateLoad(FuncIdGV);
+  } else {
+    // Unknown targets (i.e. indirect calls) are always unknown.
+    FuncId = IRB.getInt64(CsiCallsiteUnknownTargetId);
+  }
+  assert(FuncId != NULL);
+  CsiCallProperty Prop;
+  Prop.setIsIndirect(!Called);
+  Value *PropVal = Prop.getValue(IRB);
+  insertConditionalHookCall(I, CsiBeforeCallsite,
+                            {CallsiteId, FuncId, PropVal});
+
+  BasicBlock::iterator Iter(I);
+  if (IsInvoke) {
+    // There are two "after" positions for invokes: the normal block
+    // and the exception block. This also means we have to recompute
+    // the callsite and function IDs in each basic block so that we
+    // can use it for the after hook.
+
+    // TODO: Do we want the "after" hook for this callsite to come
+    // before or after the BB entry hook? Currently it is inserted
+    // before BB entry because instrumentCallsite is called after
+    // instrumentBasicBlock.
+    InvokeInst *II = dyn_cast<InvokeInst>(I);
+    BasicBlock *NormalBB = II->getNormalDest();
+    IRB.SetInsertPoint(&*NormalBB->getFirstInsertionPt());
+    CallsiteId = CallsiteFED.localToGlobalId(LocalId, IRB);
+    if (FuncIdGV != NULL) FuncId = IRB.CreateLoad(FuncIdGV);
+    PropVal = Prop.getValue(IRB);
+    insertConditionalHookCall(&*IRB.GetInsertPoint(), CsiAfterCallsite,
+                              {CallsiteId, FuncId, PropVal});
+
+    BasicBlock *UnwindBB = II->getUnwindDest();
+    IRB.SetInsertPoint(&*UnwindBB->getFirstInsertionPt());
+    CallsiteId = CallsiteFED.localToGlobalId(LocalId, IRB);
+    if (FuncIdGV != NULL) FuncId = IRB.CreateLoad(FuncIdGV);
+    PropVal = Prop.getValue(IRB);
+    insertConditionalHookCall(&*IRB.GetInsertPoint(), CsiAfterCallsite,
+                              {CallsiteId, FuncId, PropVal});
+  } else {
+    // Simple call instruction; there is only one "after" position.
+    Iter++;
+    IRB.SetInsertPoint(&*Iter);
+    PropVal = Prop.getValue(IRB);
+    insertConditionalHookCall(&*Iter, CsiAfterCallsite,
+                              {CallsiteId, FuncId, PropVal});
+  }
+}
+
+void CSIImpl::insertConditionalHookCall(Instruction *I, Function *HookFunction,
+                                        ArrayRef<Value *> HookArgs) {
+  IRBuilder<> IRB(I);
+  // Value *Cond = IRB.CreateICmpEQ(IRB.CreateLoad(DisableInstrGV), IRB.getInt1(false));
+  // TerminatorInst *TI = SplitBlockAndInsertIfThen(Cond, I, false);
+  // IRB.SetInsertPoint(TI);
+  // IRB.CreateStore(IRB.getInt1(true), DisableInstrGV);
+  Instruction *Call = IRB.CreateCall(HookFunction, HookArgs);
+  setInstrumentationDebugLoc(I, Call);
+  // IRB.CreateStore(IRB.getInt1(false), DisableInstrGV);
+}
+
+
+void CSIImpl::initializeFEDTables() {
+  FunctionFED = FrontEndDataTable(M, CsiFunctionBaseIdName);
+  FunctionExitFED = FrontEndDataTable(M, CsiFunctionExitBaseIdName);
+  BasicBlockFED = FrontEndDataTable(M, CsiBasicBlockBaseIdName);
+  CallsiteFED = FrontEndDataTable(M, CsiCallsiteBaseIdName);
+  LoadFED = FrontEndDataTable(M, CsiLoadBaseIdName);
+  StoreFED = FrontEndDataTable(M, CsiStoreBaseIdName);
+}
+
+uint64_t CSIImpl::getLocalFunctionID(Function &F) {
+  uint64_t LocalId = FunctionFED.add(F);
+  FuncOffsetMap[F.getName()] = LocalId;
+  return LocalId;
+}
+
+void CSIImpl::generateInitCallsiteToFunction() {
+  LLVMContext &C = M.getContext();
+  BasicBlock *EntryBB = BasicBlock::Create(C, "", InitCallsiteToFunction);
+  IRBuilder<> IRB(ReturnInst::Create(C, EntryBB));
+
+  GlobalVariable *Base = FunctionFED.baseId();
+  LoadInst *LI = IRB.CreateLoad(Base);
+  // Traverse the map of function name -> function local id. Generate
+  // a store of each function's global ID to the corresponding weak
+  // global variable.
+ for (const auto &it : FuncOffsetMap) { + std::string GVName = CsiFuncIdVariablePrefix + it.first.str(); + GlobalVariable *GV = nullptr; + if ((GV = M.getGlobalVariable(GVName)) == nullptr) { + GV = new GlobalVariable(M, IRB.getInt64Ty(), false, + GlobalValue::WeakAnyLinkage, + IRB.getInt64(CsiCallsiteUnknownTargetId), GVName); + } + assert(GV); + IRB.CreateStore(IRB.CreateAdd(LI, IRB.getInt64(it.second)), GV); + } +} + +void CSIImpl::initializeCsi() { + IntptrTy = DL.getIntPtrType(M.getContext()); + + initializeFEDTables(); + if (Options.InstrumentFuncEntryExit) + initializeFuncHooks(); + if (Options.InstrumentMemoryAccesses) + initializeLoadStoreHooks(); + if (Options.InstrumentBasicBlocks) + initializeBasicBlockHooks(); + if (Options.InstrumentCalls) + initializeCallsiteHooks(); + if (Options.InstrumentMemIntrinsics) + initializeMemIntrinsicsHooks(); + + FunctionType *FnType = + FunctionType::get(Type::getVoidTy(M.getContext()), {}, false); + InitCallsiteToFunction = checkCsiInterfaceFunction( + M.getOrInsertFunction(CsiInitCallsiteToFunctionName, FnType)); + assert(InitCallsiteToFunction); + InitCallsiteToFunction->setLinkage(GlobalValue::InternalLinkage); + + /* + The runtime declares this as a __thread var --- need to change this decl generation + or the tool won't compile + DisableInstrGV = new GlobalVariable(M, IntegerType::get(M.getContext(), 1), false, + GlobalValue::ExternalLinkage, nullptr, + CsiDisableInstrumentationName, nullptr, + GlobalValue::GeneralDynamicTLSModel, 0, true); + */ +} + +// Create a struct type to match the unit_fed_entry_t type in csirt.c. 
+StructType *CSIImpl::getUnitFedTableType(LLVMContext &C, + PointerType *EntryPointerType) { + return StructType::get(IntegerType::get(C, 64), + Type::getInt8PtrTy(C, 0), + EntryPointerType); +} + +Constant *CSIImpl::fedTableToUnitFedTable(Module &M, + StructType *UnitFedTableType, + FrontEndDataTable &FedTable) { + Constant *NumEntries = + ConstantInt::get(IntegerType::get(M.getContext(), 64), FedTable.size()); + Constant *BaseIdPtr = + ConstantExpr::getPointerCast(FedTable.baseId(), + Type::getInt8PtrTy(M.getContext(), 0)); + Constant *InsertedTable = FedTable.insertIntoModule(M); + return ConstantStruct::get(UnitFedTableType, NumEntries, BaseIdPtr, + InsertedTable); +} + +void CSIImpl::collectUnitFEDTables() { + LLVMContext &C = M.getContext(); + StructType *UnitFedTableType = + getUnitFedTableType(C, FrontEndDataTable::getPointerType(C)); + + // The order of the FED tables here must match the enum in csirt.c and the + // instrumentation_counts_t in csi.h. + UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, FunctionFED)); + UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, FunctionExitFED)); + UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, BasicBlockFED)); + UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, CallsiteFED)); + UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, LoadFED)); + UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, StoreFED)); +} + +CallInst *CSIImpl::createRTUnitInitCall(IRBuilder<> &IRB) { + LLVMContext &C = M.getContext(); + + StructType *UnitFedTableType = + getUnitFedTableType(C, FrontEndDataTable::getPointerType(C)); + + // Lookup __csirt_unit_init + SmallVector InitArgTypes({IRB.getInt8PtrTy(), + PointerType::get(UnitFedTableType, 0), + InitCallsiteToFunction->getType()}); + FunctionType *InitFunctionTy = + FunctionType::get(IRB.getVoidTy(), InitArgTypes, false); + RTUnitInit = checkCsiInterfaceFunction( + 
M.getOrInsertFunction(CsiRtUnitInitName, InitFunctionTy)); + assert(RTUnitInit); + + ArrayType *UnitFedTableArrayType = + ArrayType::get(UnitFedTableType, UnitFedTables.size()); + Constant *Table = ConstantArray::get(UnitFedTableArrayType, UnitFedTables); + GlobalVariable *GV = new GlobalVariable(M, UnitFedTableArrayType, false, + GlobalValue::InternalLinkage, Table, + CsiUnitFedTableArrayName); + + Constant *Zero = ConstantInt::get(IRB.getInt32Ty(), 0); + Value *GepArgs[] = {Zero, Zero}; + + // Insert call to __csirt_unit_init + return IRB.CreateCall( + RTUnitInit, + {IRB.CreateGlobalStringPtr(M.getName()), + ConstantExpr::getGetElementPtr(GV->getValueType(), GV, GepArgs), + InitCallsiteToFunction}); +} + +void CSIImpl::finalizeCsi() { + LLVMContext &C = M.getContext(); + + // Add CSI global constructor, which calls unit init. + Function *Ctor = + Function::Create(FunctionType::get(Type::getVoidTy(C), false), + GlobalValue::InternalLinkage, CsiRtUnitCtorName, &M); + BasicBlock *CtorBB = BasicBlock::Create(C, "", Ctor); + IRBuilder<> IRB(ReturnInst::Create(C, CtorBB)); + + // Insert __csi_func_id_ weak symbols for all defined functions and + // generate the runtime code that stores to all of them. + generateInitCallsiteToFunction(); + + CallInst *Call = createRTUnitInitCall(IRB); + + // Add the constructor to the global list + appendToGlobalCtors(M, Ctor, CsiUnitCtorPriority); + + CallGraphNode *CNCtor = CG->getOrInsertFunction(Ctor); + CallGraphNode *CNFunc = CG->getOrInsertFunction(RTUnitInit); + CNCtor->addCalledFunction(Call, CNFunc); +} + +bool CSIImpl::shouldNotInstrumentFunction(Function &F) { + Module &M = *F.getParent(); + // Never instrument the CSI ctor. + if (F.hasName() && F.getName() == CsiRtUnitCtorName) + return true; + + // Don't instrument functions that will run before or + // simultaneously with CSI ctors. 
+  GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors");
+  if (GV == nullptr)
+    return false;
+  ConstantArray *CA = cast<ConstantArray>(GV->getInitializer());
+  for (Use &OP : CA->operands()) {
+    if (isa<ConstantAggregateZero>(OP))
+      continue;
+    ConstantStruct *CS = cast<ConstantStruct>(OP);
+
+    if (Function *CF = dyn_cast<Function>(CS->getOperand(1))) {
+      uint64_t Priority =
+          dyn_cast<ConstantInt>(CS->getOperand(0))->getLimitedValue();
+      if (Priority <= CsiUnitCtorPriority && CF->getName() == F.getName()) {
+        // Do not instrument F.
+        return true;
+      }
+    }
+  }
+  // false means do instrument it.
+  return false;
+}
+
+bool CSIImpl::isVtableAccess(Instruction *I) {
+  if (MDNode *Tag = I->getMetadata(LLVMContext::MD_tbaa))
+    return Tag->isTBAAVtableAccess();
+  return false;
+}
+
+bool CSIImpl::addrPointsToConstantData(Value *Addr) {
+  // If this is a GEP, just analyze its pointer operand.
+  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Addr))
+    Addr = GEP->getPointerOperand();
+
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) {
+    if (GV->isConstant()) {
+      return true;
+    }
+  } else if (LoadInst *L = dyn_cast<LoadInst>(Addr)) {
+    if (isVtableAccess(L)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool CSIImpl::isAtomic(Instruction *I) {
+  if (LoadInst *LI = dyn_cast<LoadInst>(I))
+    return LI->isAtomic() && LI->getSyncScopeID() != SyncScope::SingleThread;
+  if (StoreInst *SI = dyn_cast<StoreInst>(I))
+    return SI->isAtomic() && SI->getSyncScopeID() != SyncScope::SingleThread;
+  if (isa<AtomicRMWInst>(I))
+    return true;
+  if (isa<AtomicCmpXchgInst>(I))
+    return true;
+  if (isa<FenceInst>(I))
+    return true;
+  return false;
+}
+
+void CSIImpl::computeLoadAndStoreProperties(
+    SmallVectorImpl<std::pair<Instruction *, CsiLoadStoreProperty>>
+        &LoadAndStoreProperties,
+    SmallVectorImpl<Instruction *> &BBLoadsAndStores,
+    const DataLayout &DL) {
+  SmallSet<Value *, 8> WriteTargets;
+
+  for (SmallVectorImpl<Instruction *>::reverse_iterator
+           It = BBLoadsAndStores.rbegin(),
+           E = BBLoadsAndStores.rend();
+       It != E; ++It) {
+    Instruction *I = *It;
+    unsigned Alignment;
+    if (StoreInst *Store = dyn_cast<StoreInst>(I)) {
+      Value *Addr = Store->getPointerOperand();
+      WriteTargets.insert(Addr);
+      CsiLoadStoreProperty Prop;
+      // Update alignment property data
+      Alignment = Store->getAlignment();
+      Prop.setAlignment(Alignment);
+      // Set vtable-access property
+      Prop.setIsVtableAccess(isVtableAccess(Store));
+      // Set constant-data-access property
+      Prop.setIsConstant(addrPointsToConstantData(Addr));
+      Value *Obj = GetUnderlyingObject(Addr, DL);
+      // Set is-on-stack property
+      Prop.setIsOnStack(isa<AllocaInst>(Obj));
+      // Set may-be-captured property
+      Prop.setMayBeCaptured(isa<GlobalValue>(Obj) ||
+                            PointerMayBeCaptured(Addr, true, true));
+      LoadAndStoreProperties.push_back(std::make_pair(I, Prop));
+    } else {
+      LoadInst *Load = cast<LoadInst>(I);
+      Value *Addr = Load->getPointerOperand();
+      CsiLoadStoreProperty Prop;
+      // Update alignment property data
+      Alignment = Load->getAlignment();
+      Prop.setAlignment(Alignment);
+      // Set vtable-access property
+      Prop.setIsVtableAccess(isVtableAccess(Load));
+      // Set constant-data-access-property
+      Prop.setIsConstant(addrPointsToConstantData(Addr));
+      Value *Obj = GetUnderlyingObject(Addr, DL);
+      // Set is-on-stack property
+      Prop.setIsOnStack(isa<AllocaInst>(Obj));
+      // Set may-be-captured property
+      Prop.setMayBeCaptured(isa<GlobalValue>(Obj) ||
+                            PointerMayBeCaptured(Addr, true, true));
+      // Set load-read-before-write-in-bb property
+      bool HasBeenSeen = WriteTargets.count(Addr) > 0;
+      Prop.setLoadReadBeforeWriteInBB(HasBeenSeen);
+      LoadAndStoreProperties.push_back(std::make_pair(I, Prop));
+    }
+  }
+  BBLoadsAndStores.clear();
+}
+
+void CSIImpl::instrumentFunction(Function &F) {
+  // This is required to prevent instrumenting the call to
+  // __csi_module_init from within the module constructor.
+  if (F.empty() || shouldNotInstrumentFunction(F)) {
+    return;
+  }
+
+  SmallVector<std::pair<Instruction *, CsiLoadStoreProperty>, 8>
+      LoadAndStoreProperties;
+  SmallVector<Instruction *, 8> ReturnInstructions;
+  SmallVector<Instruction *, 8> MemIntrinsics;
+  SmallVector<Instruction *, 8> Callsites;
+  SmallVector<BasicBlock *, 8> BasicBlocks;
+  SmallVector<Instruction *, 8> AtomicAccesses;
+
+  // Compile lists of all instrumentation points before anything is modified.
+  for (BasicBlock &BB : F) {
+    SmallVector<Instruction *, 8> BBLoadsAndStores;
+    for (Instruction &I : BB) {
+      if (isAtomic(&I))
+        AtomicAccesses.push_back(&I);
+      else if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
+        BBLoadsAndStores.push_back(&I);
+      } else if (isa<ReturnInst>(I)) {
+        ReturnInstructions.push_back(&I);
+      } else if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
+        if (isa<MemIntrinsic>(I)) {
+          MemIntrinsics.push_back(&I);
+        } else {
+          Callsites.push_back(&I);
+        }
+        computeLoadAndStoreProperties(LoadAndStoreProperties, BBLoadsAndStores,
+                                      DL);
+      }
+    }
+    computeLoadAndStoreProperties(LoadAndStoreProperties, BBLoadsAndStores, DL);
+    BasicBlocks.push_back(&BB);
+  }
+
+  uint64_t LocalId = getLocalFunctionID(F);
+
+  // Instrument basic blocks. Note that we do this before other instrumentation
+  // so that we put this at the beginning of the basic block, and then the
+  // function entry call goes before the call to basic block entry.
+  if (Options.InstrumentBasicBlocks)
+    for (BasicBlock *BB : BasicBlocks)
+      instrumentBasicBlock(*BB);
+
+  // Do this work in a separate loop after copying the iterators so that we
+  // aren't modifying the list as we're iterating.
+  if (Options.InstrumentMemoryAccesses)
+    for (std::pair<Instruction *, CsiLoadStoreProperty> p :
+         LoadAndStoreProperties)
+      instrumentLoadOrStore(p.first, p.second, DL);
+
+  // Instrument atomic memory accesses in any case (they can be used to
+  // implement synchronization).
+  if (Options.InstrumentAtomics)
+    for (Instruction *I : AtomicAccesses)
+      instrumentAtomic(I, DL);
+
+  if (Options.InstrumentMemIntrinsics)
+    for (Instruction *I : MemIntrinsics)
+      instrumentMemIntrinsic(I);
+
+  if (Options.InstrumentCalls)
+    for (Instruction *I : Callsites)
+      instrumentCallsite(I);
+
+  // Instrument function entry/exit points.
+  if (Options.InstrumentFuncEntryExit) {
+    IRBuilder<> IRB(&*F.getEntryBlock().getFirstInsertionPt());
+    CsiFuncProperty FuncEntryProp;
+    CsiFuncExitProperty FuncExitProp;
+    Value *FuncId = FunctionFED.localToGlobalId(LocalId, IRB);
+    Value *PropVal = FuncEntryProp.getValue(IRB);
+    insertConditionalHookCall(&*IRB.GetInsertPoint(), CsiFuncEntry,
+                              {FuncId, PropVal});
+
+    for (Instruction *I : ReturnInstructions) {
+      IRBuilder<> IRBRet(I);
+      // uint64_t ExitLocalId = FunctionExitFED.add(F);
+      uint64_t ExitLocalId = FunctionExitFED.add(*I);
+      Value *ExitCsiId = FunctionExitFED.localToGlobalId(ExitLocalId, IRBRet);
+      PropVal = FuncExitProp.getValue(IRBRet);
+      insertConditionalHookCall(I, CsiFuncExit,
+                                {ExitCsiId, FuncId, PropVal});
+    }
+  }
+}
+
+void ComprehensiveStaticInstrumentation::getAnalysisUsage(
+    AnalysisUsage &AU) const {
+  AU.addRequired<CallGraphWrapperPass>();
+}
+
+bool ComprehensiveStaticInstrumentation::runOnModule(Module &M) {
+  if (skipModule(M))
+    return false;
+
+  CallGraph *CG = &getAnalysis<CallGraphWrapperPass>().getCallGraph();
+
+  return CSIImpl(M, CG, Options).run();
+}
diff --git a/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp b/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp
index c3e323613c7079..f9ba37987a61e9 100644
--- a/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp
@@ -105,6 +105,7 @@ void llvm::initializeInstrumentation(PassRegistry &Registry) {
   initializeAddressSanitizerModulePass(Registry);
   initializeBoundsCheckingLegacyPassPass(Registry);
   initializeControlHeightReductionLegacyPassPass(Registry);
+  initializeCilkSanitizerPass(Registry);
   initializeGCOVProfilerLegacyPassPass(Registry);
   initializePGOInstrumentationGenLegacyPassPass(Registry);
   initializePGOInstrumentationUseLegacyPassPass(Registry);
@@ -117,6 +118,7 @@ void llvm::initializeInstrumentation(PassRegistry &Registry) {
   initializeSanitizerCoverageModulePass(Registry);
   initializeDataFlowSanitizerPass(Registry);
initializeEfficiencySanitizerPass(Registry); + initializeComprehensiveStaticInstrumentationPass(Registry); } /// LLVMInitializeInstrumentation - C binding for diff --git a/llvm/lib/Transforms/LLVMBuild.txt b/llvm/lib/Transforms/LLVMBuild.txt index f061c6d9285e3e..ae57c40a946255 100644 --- a/llvm/lib/Transforms/LLVMBuild.txt +++ b/llvm/lib/Transforms/LLVMBuild.txt @@ -16,7 +16,7 @@ ;===------------------------------------------------------------------------===; [common] -subdirectories = AggressiveInstCombine Coroutines IPO InstCombine Instrumentation Scalar Utils Vectorize ObjCARC +subdirectories = AggressiveInstCombine Coroutines IPO InstCombine Instrumentation Scalar Tapir Utils Vectorize ObjCARC [component_0] type = Group diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt index e3548ce5cd0afd..688365dfae4676 100644 --- a/llvm/lib/Transforms/Scalar/CMakeLists.txt +++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt @@ -28,6 +28,7 @@ add_llvm_library(LLVMScalarOpts LoopDeletion.cpp LoopDataPrefetch.cpp LoopDistribute.cpp + LoopFuse.cpp LoopIdiomRecognize.cpp LoopInstSimplify.cpp LoopInterchange.cpp diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index 9861948c8297a9..fcc11e0716f9b5 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -1123,8 +1123,12 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, if (IsValueFullyAvailableInBlock(Pred, FullyAvailableBlocks, 0)) { continue; } + if (isa(Pred->getTerminator())) { + continue; + } - if (Pred->getTerminator()->getNumSuccessors() != 1) { + if (Pred->getTerminator()->getNumSuccessors() != 1 && + !isa(Pred->getTerminator())) { if (isa(Pred->getTerminator())) { LLVM_DEBUG( dbgs() << "COULD NOT PRE LOAD BECAUSE OF INDBR CRITICAL EDGE '" @@ -1327,6 +1331,20 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { return false; } + // If we depend on a detach instruction, reject. 
+  for (unsigned i = 0, e = NumDeps; i != e; ++i) {
+    MemDepResult DepInfo = Deps[i].getResult();
+    if (!(DepInfo.getInst()))
+      continue;
+    if (isa<DetachInst>(DepInfo.getInst()) ||
+        isa<ReattachInst>(DepInfo.getInst())) {
+      DEBUG(dbgs() << "GVN: Cannot process" << *LI <<
+            " due to dependency on" <<
+            *(DepInfo.getInst()) << "\n");
+      return false;
+    }
+  }
+
   // If this load follows a GEP, see if we can PRE the indices before analyzing.
   if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LI->getOperand(0))) {
     for (GetElementPtrInst::op_iterator OI = GEP->idx_begin(),
@@ -2184,6 +2202,8 @@ bool GVN::performScalarPRE(Instruction *CurInst) {
   unsigned NumWithout = 0;
   BasicBlock *PREPred = nullptr;
   BasicBlock *CurrentBlock = CurInst->getParent();
+  BasicBlock *DetachPred = nullptr, *ReattachPred = nullptr;
+  Value *DetachV = nullptr, *ReattachV = nullptr;

   // Update the RPO numbers for this function.
   if (InvalidBlockRPONumbers)
@@ -2212,18 +2232,36 @@ bool GVN::performScalarPRE(Instruction *CurInst) {
       break;
     }

+    // Ignore reattach predecessors for determining whether to perform
+    // PRE. These predecessors have the same available values as
+    // their corresponding detach predecessors.
+    if (isa<ReattachInst>(P->getTerminator()))
+      ReattachPred = P;
+
     uint32_t TValNo = VN.phiTranslate(P, CurrentBlock, ValNo, *this);
     Value *predV = findLeader(P, TValNo);
+
+    if (isa<DetachInst>(P->getTerminator())) {
+      assert(nullptr == DetachPred && "Multiple detach predecessors found!");
+      DetachPred = P;
+    }
+
     if (!predV) {
-      predMap.push_back(std::make_pair(static_cast<Value *>(nullptr), P));
-      PREPred = P;
-      ++NumWithout;
+      if (!isa<ReattachInst>(P->getTerminator())) {
+        predMap.push_back(std::make_pair(static_cast<Value *>(nullptr), P));
+        PREPred = P;
+        ++NumWithout;
+      }
     } else if (predV == CurInst) {
       /* CurInst dominates this predecessor.
*/ NumWithout = 2; break; } else { predMap.push_back(std::make_pair(predV, P)); + if (isa(P->getTerminator())) + DetachV = predV; + if (isa(P->getTerminator())) + ReattachV = predV; ++NumWith; } } @@ -2233,6 +2271,15 @@ bool GVN::performScalarPRE(Instruction *CurInst) { if (NumWithout > 1 || NumWith == 0) return false; + // If the reattach predecessor has a value that does not match the + // detach predecessor's value, assume that this is not a redundant + // instruction. + if (ReattachV && ReattachV != DetachV) + return false; + + assert((!ReattachPred || DetachPred) && + "Reattach predecessor found with no detach predecessor"); + // We may have a case where all predecessors have the instruction, // and we just need to insert a phi node. Otherwise, perform // insertion. @@ -2256,7 +2303,8 @@ bool GVN::performScalarPRE(Instruction *CurInst) { // the edge to be split and perform the PRE the next time we iterate // on the function. unsigned SuccNum = GetSuccessorNumber(PREPred, CurrentBlock); - if (isCriticalEdge(PREPred->getTerminator(), SuccNum)) { + if (isCriticalEdge(PREPred->getTerminator(), SuccNum) && + !isa(PREPred->getTerminator())) { toSplit.push_back(std::make_pair(PREPred->getTerminator(), SuccNum)); return false; } @@ -2267,6 +2315,9 @@ bool GVN::performScalarPRE(Instruction *CurInst) { LLVM_DEBUG(verifyRemoved(PREInstr)); PREInstr->deleteValue(); return false; + } else if (DetachPred == PREPred && ReattachPred) { + assert(nullptr == DetachV && "Detach predecessor already had a value"); + predMap.push_back(std::make_pair(PREInstr, ReattachPred)); } } diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 48de56a02834d5..bf2865332ce880 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -987,8 +987,10 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) { // because now the condition in this block can be threaded through // predecessors 
of our predecessor block. if (BasicBlock *SinglePred = BB->getSinglePredecessor()) { - const Instruction *TI = SinglePred->getTerminator(); - if (!TI->isExceptionalTerminator() && TI->getNumSuccessors() == 1 && + const TerminatorInst *TI = SinglePred->getTerminator(); + if (!TI->isExceptional() && + !isa(SinglePred->getTerminator()) && // Can't remove syncs + TI->getNumSuccessors() == 1 && SinglePred != BB && !hasAddressTakenAndUsed(BB)) { // If SinglePred was a loop header, BB becomes one. if (LoopHeaders.erase(SinglePred)) @@ -1373,7 +1375,8 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LoadI) { } } - if (!PredAvailable) { + if (!PredAvailable || + isa(PredBB->getTerminator())) { OneUnavailablePred = PredBB; continue; } @@ -1416,6 +1419,9 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LoadI) { // unconditional branch, we know that it isn't a critical edge. if (PredsScanned.size() == AvailablePreds.size()+1 && OneUnavailablePred->getTerminator()->getNumSuccessors() == 1) { + // If the predecessor is a reattach, we can't split the edge + if (isa(OneUnavailablePred->getTerminator())) + return false; UnavailablePred = OneUnavailablePred; } else if (PredsScanned.size() != AvailablePreds.size()) { // Otherwise, we had multiple unavailable predecessors or we had a critical @@ -1428,8 +1434,10 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LoadI) { // Add all the unavailable predecessors to the PredsToSplit list. for (BasicBlock *P : predecessors(LoadBB)) { - // If the predecessor is an indirect goto, we can't split the edge. - if (isa(P->getTerminator())) + // If the predecessor is an indirect goto or a reattach, we + // can't split the edge. 
+ if (isa(P->getTerminator()) || + isa(P->getTerminator())) return false; if (!AvailablePredSet.count(P)) diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index d204654c39157d..d598ec917d8932 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -72,6 +72,7 @@ #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" +#include "llvm/Transforms/Utils/TapirUtils.h" #include #include using namespace llvm; @@ -1775,6 +1776,18 @@ bool llvm::promoteLoopAccessesToScalars( bool DereferenceableInPH = false; bool SafeToInsertStore = false; + // We cannot speculate loads to values that are stored in a detached + // context within the loop. Precompute whether or not there is a + // detach within this loop. + bool DetachWithinLoop = + isa(CurLoop->getHeader()->getTerminator()); + if (!DetachWithinLoop) + for (BasicBlock *BB : CurLoop->getBlocks()) + if (isa(BB->getTerminator())) { + DetachWithinLoop = true; + break; + } + SmallVector LoopUses; // We start with an alignment of one and try to find instructions that allow @@ -1838,6 +1851,23 @@ bool llvm::promoteLoopAccessesToScalars( if (!Store->isUnordered()) return false; + // We conservatively avoid promoting stores that are detached + // within the loop. Technically it can be legal to move these + // stores -- the program already contains a determinacy race + // -- but to preserve the serial execution, we have to avoid + // moving stores that are loaded. For now, we simply avoid + // moving these stores. + // + // TODO: The call to GetDetachedCtx can potentially be + // expensive. Optimize this analysis in the future. 
+ if (DetachWithinLoop && + CurLoop->contains(GetDetachedCtx(Store->getParent()))) + return false; + + // Note that we only check GuaranteedToExecute inside the store case + // so that we do not introduce stores where they did not exist before + // (which would break the LLVM concurrency model). + SawUnorderedAtomic |= Store->isAtomic(); SawNotAtomic |= !Store->isAtomic(); diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp new file mode 100644 index 00000000000000..4c90ace351c603 --- /dev/null +++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp @@ -0,0 +1,561 @@ +//===------------- LoopFuse.cpp - Loop Fusion Pass ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// Fuse two adjacent loops to improve cache locality. Loops are multi-versioned +/// and unconditionally fused along one version to check for dependence +/// legality. Legality decides whether to keep the original version or the fused +/// version or both versions with runtime checks. LoopAccessLegacyAnalysis is used to +/// check dependence legality. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar/LoopFuse.h" + +#define DEBUG_TYPE "loop-fuse" + +using namespace llvm; + +static cl::opt + LFuseVerify("loop-fuse-verify", cl::Hidden, + cl::desc("Turn on DominatorTree and LoopInfo verification " + "after Loop Fusion"), + cl::init(false)); + +STATISTIC(NumLoopsFused, "Number of loops fused"); + +// Replace IncomingBlocks in PHI nodes of @Br successors from Br's parent to +// @To. 
+void LoopFuse::RewritePHI(BranchInst *Br, BasicBlock *To) { + assert((Br && To)); + for (auto *S : Br->successors()) { + auto I = S->begin(); + while (PHINode *P = dyn_cast<PHINode>(&*I)) { + P->setIncomingBlock(P->getBasicBlockIndex(Br->getParent()), To); + ++I; + } + } +} + +//===----------------------------------------------------------------------===// +// Loop Fusion Implementation. +// Fusion legality is checked by first fusing the loops and then looking for +// fusion-preventing dependences. This is done by versioning the loops first. +// The check is done on the versioned loops, and one of the versions is +// discarded based on the legality result. +//===----------------------------------------------------------------------===// + +/* Fuse loops @L1 and @L2. Remove ConnectingBlock (CB) and connect L1Latch to + L2Header. Loop from L2Latch to L1Header. Make L1's indvar the indvar for the + fused loop. Update LI by moving L2's blocks into L1, and call L1 the + FusedLoop. Return FusedLoop. + L1 + | L1Blocks + CB --> | \ + | L2Blocks/ + L2 |/ +*/ +Loop *LoopFuse::FuseLoops(Loop &L1, Loop &L2) { + PHINode *P1 = L1.getCanonicalInductionVariable(); + PHINode *P2 = L2.getCanonicalInductionVariable(); + + BranchInst *Br1 = dyn_cast<BranchInst>(L1.getLoopLatch()->getTerminator()); + BranchInst *Br2 = dyn_cast<BranchInst>(L2.getLoopLatch()->getTerminator()); + + // Make Br2 branch to the L1 header based on Br1's condition. + unsigned LoopBack = 0; + if (Br2->getSuccessor(1) == L2.getHeader()) + LoopBack = 1; + assert((Br2->getSuccessor(LoopBack) == L2.getHeader())); + Br2->setSuccessor(LoopBack, L1.getHeader()); + Br2->setCondition(Br1->getCondition()); + RewritePHI(Br1, Br2->getParent()); + + // Zap L2 preheader and unconditionally branch from L1 latch to L2 header. + // L2 preheader is a connecting block and it is known to contain only an + // unconditional branch to L2 header.
+ BasicBlock *L2PH = L2.getLoopPreheader(), *L2Header = L2.getHeader(); + BranchInst *L2PHBr = dyn_cast<BranchInst>(L2PH->getTerminator()); + RewritePHI(L2PHBr, Br1->getParent()); + DT->changeImmediateDominator(L2Header, L1.getLoopLatch()); + + BranchInst::Create(L2Header, Br1); + Br1->eraseFromParent(); + L2PH->dropAllReferences(); + L2PHBr->eraseFromParent(); + // Detach L2PH from DT and LI before erasing the block itself. + DT->eraseNode(L2PH); + LI->removeBlock(L2PH); + L2PH->eraseFromParent(); + + P2->replaceAllUsesWith(P1); + P2->eraseFromParent(); + + // Update LI. + // Move all blocks from L2 to L1. + SmallVector<BasicBlock *, 8> L2BBs(L2.block_begin(), L2.block_end()); + for (auto *bb : L2BBs) { + LI->removeBlock(bb); + L1.addBasicBlockToLoop(bb, *LI); + } + // Remove L2. + SE->forgetLoop(&L2); + LI->markAsRemoved(&L2); + + // Update DT: DT changed only at L2PH zap and was updated during zapping. + + return &L1; +} + +/* Version the given loops along a parallel path and fuse the cloned loops. + Check the dependence legality of the fused loop. + + L1PH BooleanBB BooleanBB + | /\ /\ + L1 L1PH L1PH.clone L1PH FusedPH + | version | | Fuse along | | + CB (L1Exit/L2PH) ----> L1 L1.clone --------> L1 L1Blocks + | | | versioned | | \ + L2 CB CB.clone path CB L2Blocks | + | | | | | |/ + L2Exit L2 L2.clone L2 | + \ / \ / + L2Exit CommonExit + CB is ConnectingBlock. +*/ +bool LoopFuse::DependenceLegal(Loop &L1, Loop &L2) { + + // Version to fuse. LoopVersioning is not used here because: + // a. Runtime checks are inserted later. + // b. Intermediate VMap updates are required. + // Moreover it is convenient for now to just clone and remap.
+ BasicBlock *BooleanBB = L1.getLoopPreheader(); + BasicBlock *L1PH = SplitEdge(BooleanBB, L1.getHeader(), DT, LI); + + ValueToValueMapTy VMap1; + SmallVector ClonedBBs1; + Loop *ClonedLoop1 = + cloneLoopWithPreheader(L1.getExitBlock(), BooleanBB, &L1, VMap1, + Twine(".L1clone"), LI, DT, ClonedBBs1); + + ValueToValueMapTy VMap2; + SmallVector ClonedBBs2; + Loop *ClonedLoop2 = + cloneLoopWithPreheader(L2.getExitBlock(), L1.getExitBlock(), &L2, VMap2, + Twine(".L2clone"), LI, DT, ClonedBBs2); + remapInstructionsInBlocks(ClonedBBs2, VMap2); + VMap1[L1.getExitBlock()] = ClonedLoop2->getLoopPreheader(); + remapInstructionsInBlocks(ClonedBBs1, VMap1); + + // Build the custom VMap by concatenating VMap1 and VMap2. + for (auto V : VMap1) + VMap[V->first] = V->second; + for (auto V : VMap2) + VMap[V->first] = V->second; + + // VMap.size() != VMap1.size() + VMap2.size() because of redundants and + // L1Exit update in VMap1 above. + + // Branch to either of the versions - using a boolean flag. + Instruction *Term = BooleanBB->getTerminator(); + FusionSwitcher = + BranchInst::Create(L1PH, ClonedLoop1->getLoopPreheader(), + ConstantInt::getTrue(L1PH->getContext()), Term); + Term->eraseFromParent(); + + // The two versions join back at L2 exit. Update DT. + if (DT->dominates(L2.getLoopLatch(), L2.getExitBlock())) + DT->changeImmediateDominator(L2.getExitBlock(), BooleanBB); + + DEBUG(dbgs() << "ClonedLoop1: " << *ClonedLoop1 << "\n"); + DEBUG(dbgs() << "ClonedLoop2: " << *ClonedLoop2 << "\n"); + + FusedLoop = FuseLoops(*ClonedLoop1, *ClonedLoop2); + DEBUG(dbgs() << "FusedLoop: " << *FusedLoop << "\n"); + + // Check dependences. + DEBUG(dbgs() << "Loop fused on versioned path. Checking dependences...\n"); + LAI = &LAA->getInfo(FusedLoop); + DEBUG(LAI->print(dbgs())); + + auto Dependences = LAI->getDepChecker().getDependences(); + // TODO@jiahao: Investigate. + // if (!Dependences || Dependences->empty()) { + // DEBUG(dbgs() << "Failed to get dependences to check fusion legality!" 
+ // << " Skipping...\n"); + // return false; + // } + + // Fusion is illegal if there is a backward dependence between memory accesses + // whose source was in L1 and sink was in L2. ClonedBBs1 and ClonedBBs2 + // contain cloned BBs from L1 and L2 respectively. They are used to check the + // containment of source and sink. + for (auto &Dep : *Dependences) { + if (Dep.isPossiblyBackward()) { + Instruction *Source = Dep.getSource(*LAI); + Instruction *Sink = Dep.getDestination(*LAI); + if (std::find(ClonedBBs1.begin(), ClonedBBs1.end(), + Source->getParent()) == ClonedBBs1.end()) + continue; + if (std::find(ClonedBBs2.begin(), ClonedBBs2.end(), Sink->getParent()) == + ClonedBBs2.end()) + continue; + DEBUG(dbgs() << "Loop carried backward dependence prevents fusion!\n"); + return false; + } + } + DEBUG(dbgs() << "Loops are dependence legal to fuse!\n"); + return true; +} + +// Return true if any of the defs made in @L1 is used inside @L2. +bool LoopFuse::DefsUsedAcrossLoops(Loop &L1, Loop &L2) { + auto DefsUsedOutsideL1 = findDefsUsedOutsideOfLoop(&L1); + for (auto *D : DefsUsedOutsideL1) { + for (auto *U : D->users()) { + if (L2.contains(dyn_cast<Instruction>(U)->getParent())) + return true; + } + } + return false; +} + +bool LoopFuse::IsLegalAndProfitable(Loop &L1, Loop &L2) { + // Basic legality. + if (!L1.empty() || !L2.empty()) { + // TODO: Update cloneLoopWithPreheader() to update LoopInfo for subloops + // too and LoopFusion can be done for loops at any depth. + DEBUG(dbgs() << "Not innermost loops! Skipping...\n"); + return false; + } + + if (L1.getLoopDepth() != L2.getLoopDepth()) { + DEBUG(dbgs() << "Loops not at same depth! Skipping...\n"); + return false; + } + + if (!L1.getLoopPreheader() || !L2.getLoopPreheader()) { + DEBUG(dbgs() << "No preheader! Skipping...\n"); + return false; + } + + if (!L1.getExitBlock() || !L2.getExitBlock()) { + DEBUG(dbgs() << "Single exit block not found!
Skipping...\n"); + return false; + } + + // Can fuse only bottom-tested loops and loops with latch being the single + // exiting block. + if ((L1.getExitingBlock() != L1.getLoopLatch()) || + (L2.getExitingBlock() != L2.getLoopLatch())) { + DEBUG(dbgs() << "Not a bottom-tested loop! Skipping...\n"); + return false; + } + + // Can fuse only adjacent loops. Adjacency is defined by: + // a. L1Exit has a single entry, only from L1Latch. + // b. L1Exit and L2Preheader are the same, i.e. the block forms the + // ConnectingBlock. + // c. The ConnectingBlock just branches unconditionally to L2Header. + auto *Br = dyn_cast<BranchInst>(&*L1.getExitBlock()->begin()); + if ((L1.getExitBlock()->getSinglePredecessor() != L1.getLoopLatch()) || + (L1.getExitBlock() != L2.getLoopPreheader()) || + (!Br || Br->isConditional())) { + DEBUG(dbgs() << "Loops not adjacent! Skipping...\n"); + return false; + } + + // The indvars of both loops must be known and canonicalized. + PHINode *P1 = L1.getCanonicalInductionVariable(); + PHINode *P2 = L2.getCanonicalInductionVariable(); + if (!P1 || !P2) { + DEBUG(dbgs() << "Unknown induction variables! Skipping...\n"); + return false; + } + + // P1 and P2 are canonical indvars. A backedge-taken count check is enough to + // ascertain that both loops have the same iteration space. + if (SE->getBackedgeTakenCount(&L1) != SE->getBackedgeTakenCount(&L2)) + return false; + + // Cannot fuse if there are uses of L1 defs in L2. + if (DefsUsedAcrossLoops(L1, L2)) + return false; + + // Dependence-based legality. + if (!DependenceLegal(L1, L2)) + return false; + + // TODO: Add profitability measures. + + return true; +} + +// Remove Loop @L completely, deleting its BBs and removing them from @LI, @DT +// and @SE, including the preheader. Finally connect the single predecessor +// (the BooleanBB that contains FusionSwitcher) of the preheader to the loop +// exit.
+void LoopFuse::RemoveLoopCompletelyWithPreheader(Loop &L) { + DEBUG(dbgs() << "Removing loop: " << L << "\n"); + BasicBlock *PH = L.getLoopPreheader(); + BasicBlock *Exit = L.getExitBlock(); + assert(Exit && "Expected Exit bb and single pred to preheader!"); + + // No need to RewritePHIs of the Exit block given the Loop is deleted, + // because the uses remain the same if FusedLoop is removed, OR the uses are + // already replaced if the original loops are deleted. + + // Branch to Exit block from FusionSwitcher. + unsigned SuccNum = 0; + if (FusionSwitcher->getSuccessor(1) == PH) + SuccNum = 1; + assert((FusionSwitcher->getSuccessor(SuccNum) == PH)); + FusionSwitcher->setSuccessor(SuccNum, Exit); + if (DT->dominates(L.getLoopLatch(), Exit)) // L1 removal case. + // Exit block's iDom is FusionSwitcher's block due to versioning. + DT->changeImmediateDominator(Exit, FusionSwitcher->getParent()); + + // Erase each of the loop blocks. Update SE, DT and LI. Collect the block + // pointers first, and detach each block from LI and DT before erasing it. + SE->forgetLoop(&L); + SmallVector<BasicBlock *, 8> LBBs(L.block_begin(), L.block_end()); + + PH->dropAllReferences(); + for (auto *bb : LBBs) { + DT->changeImmediateDominator(bb, PH); + bb->dropAllReferences(); + } + + for (auto *bb : LBBs) { + LI->removeBlock(bb); + DT->eraseNode(bb); + bb->eraseFromParent(); + } + if (LI->getLoopFor(PH)) + LI->removeBlock(PH); + DT->eraseNode(PH); + PH->eraseFromParent(); + + LI->markAsRemoved(&L); +} + +// Remove FusionSwitcher and branch directly to the given loop @L's header. +// This removes the loop's preheader and makes FusionSwitcher's block the new +// preheader.
+void LoopFuse::RemoveFusionSwitcher(Loop &L) { + assert(FusionSwitcher->isConditional()); + DEBUG(dbgs() << "Removing FusionSwitcher: " << *FusionSwitcher << "\n"); + + BasicBlock *PH = L.getLoopPreheader(); + assert((PH->size() == 1)); + + BranchInst *PHBr = dyn_cast<BranchInst>(PH->getTerminator()); + assert(PHBr->isUnconditional()); + + RewritePHI(PHBr, FusionSwitcher->getParent()); + + PHBr->removeFromParent(); + PHBr->insertBefore(FusionSwitcher); + DT->changeImmediateDominator(L.getHeader(), FusionSwitcher->getParent()); + + FusionSwitcher->eraseFromParent(); + // Detach PH from DT and LI before erasing the block itself. + DT->eraseNode(PH); + if (LI->getLoopFor(PH)) + LI->removeBlock(PH); + PH->eraseFromParent(); +} + +// Update the uses of defs that reach outside the original loop with the defs +// made in the fused loop. +void LoopFuse::UpdateUsesOutsideLoop(Loop &L) { + for (auto *D : findDefsUsedOutsideOfLoop(&L)) { + auto VI = VMap.find(D); + if (VI == VMap.end()) + continue; + + for (auto *U : D->users()) { + if (!L.contains(dyn_cast<Instruction>(U)->getParent())) { + if (auto *P = dyn_cast<PHINode>(U)) { + // Replace each incoming value that refers to D with the + // corresponding def from the fused loop. Iterate backwards, since + // removeIncomingValue() shifts the later entries. + for (unsigned i = P->getNumIncomingValues(); i != 0; --i) { + if (P->getIncomingValue(i - 1) == D) { + P->removeIncomingValue(i - 1); + P->addIncoming(VI->second, FusedLoop->getLoopLatch()); + } + } + } else + U->replaceUsesOfWith(D, VI->second); + } + } + } +} + +// Add/update a phi for defs that reach uses outside the loop, from the +// original loop @L and from the fused loop. Insert the phis into the fused +// loop's exit block, which is also the exit block of the original L2 loop. +// @OrigIncomingBlock refers to the block from where a def reaches outside of +// the loop - the L2 latch. +// TODO: This routine is similar to LoopVersioning's addPHINodes(), but +// rewritten here as access to internal data structures differs.
+void LoopFuse::AddPHIsOutsideLoop(Loop &L, BasicBlock *OrigIncomingBlock) { + BasicBlock *PHIBlock = FusedLoop->getExitBlock(); + assert(PHIBlock && "Unable to find FusedLoop's ExitBlock!"); + + for (auto *Inst : findDefsUsedOutsideOfLoop(&L)) { + PHINode *PN = nullptr; + auto FusedInst = VMap.find(Inst); + assert((FusedInst != VMap.end()) && + "Expected an equivalent instruction in fused loop!"); + // Update/add phi node for this Inst. + bool FoundInst = false; + for (auto I = PHIBlock->begin(); !FoundInst && (PN = dyn_cast(I)); + ++I) { + for (unsigned i = 0, e = PN->getNumIncomingValues(); !FoundInst && i != e; + ++i) + if (PN->getIncomingValue(i) == Inst) + FoundInst = true; + } + if (!PN) { + PN = PHINode::Create(Inst->getType(), 2, Inst->getName() + ".lfuse", + &PHIBlock->front()); + + for (auto *U : Inst->users()) + if (!L.contains(dyn_cast(U)->getParent())) + U->replaceUsesOfWith(Inst, PN); + + PN->addIncoming(Inst, OrigIncomingBlock); + } + // Add incoming value from fused loop. + PN->addIncoming(FusedInst->second, FusedLoop->getLoopLatch()); + } +} + +bool LoopFuse::run(Loop &L1, Loop &L2) { + assert((LI && LAA && DT && SE)); + DEBUG(dbgs() << "\nTrying to fuse:\n" << L1 << "AND\n" << L2 << "\n"); + + FusionSwitcher = nullptr; + FusedLoop = nullptr; + VMap.clear(); + bool Changed = false; + if (IsLegalAndProfitable(L1, L2)) { + assert((FusedLoop && FusionSwitcher)); + auto *RuntimePtrChecks = LAI->getRuntimePointerChecking(); + if (RuntimePtrChecks->Need) { + // Add runtime checks and add/update phis in exit block for the defs + // reaching from two versions. + Instruction *FirstCheck, *LastCheck; + std::tie(FirstCheck, LastCheck) = LAI->addRuntimeChecks(FusionSwitcher); + // TODO: Add SCEVRuntime checks? + FusionSwitcher->setCondition(LastCheck); + + AddPHIsOutsideLoop(L1, L2.getLoopLatch()); + AddPHIsOutsideLoop(L2, L2.getLoopLatch()); + FusionKind = VERSIONED_FUSION; + + } else { + // Remove original loops and retain FusedLoop. 
Also update the uses of + // defs from original loops with the defs from fused loop. + UpdateUsesOutsideLoop(L1); + UpdateUsesOutsideLoop(L2); + RemoveLoopCompletelyWithPreheader(L1); + RemoveLoopCompletelyWithPreheader(L2); + + // Remove FusionSwitcher and directly point to FusedLoop header. + if (DT->dominates(FusionSwitcher->getParent(), FusedLoop->getExitBlock())) + DT->changeImmediateDominator(FusedLoop->getExitBlock(), + FusedLoop->getLoopLatch()); + RemoveFusionSwitcher(*FusedLoop); + FusionKind = PURE_FUSION; + } + ++NumLoopsFused; + Changed = true; + + } else { + if (FusedLoop) { + // Loops were versioned to check legality. Rollback to original state. + RemoveLoopCompletelyWithPreheader(*FusedLoop); + + // Remove FusionSwitcher and directly point to L1 header. + if (DT->dominates(FusionSwitcher->getParent(), L2.getExitBlock())) + DT->changeImmediateDominator(L2.getExitBlock(), L2.getLoopLatch()); + RemoveFusionSwitcher(L1); + FusionKind = REVERTED_FUSION; + } + } + + if (LFuseVerify) { + LI->verify(*DT); + DT->verifyDomTree(); + } + + return Changed; +} + +void PopulateInnermostLoopsOf(Loop &L, SmallVectorImpl<Loop *> &Loops) { + if (L.empty()) + Loops.push_back(&L); + for (auto I = L.begin(), E = L.end(); I != E; ++I) + PopulateInnermostLoopsOf(**I, Loops); +} + +bool LoopFuse::runOnFunction(Function &F) { + LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + LAA = &getAnalysis<LoopAccessLegacyAnalysis>(); + DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree(); + SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + + // Populate innermost loops and try an n^2 combination of loop fusion. + bool Changed = false; + SmallVector<Loop *, 8> Loops; + for (auto L = LI->begin(), Le = LI->end(); L != Le; ++L) + PopulateInnermostLoopsOf(**L, Loops); + + auto L1 = Loops.begin(), L1e = Loops.end(); + while (L1 != L1e) { + auto L2 = Loops.begin(), L2e = Loops.end(); + while (L2 != L2e) { + if (L1 == L2) { + ++L2; + continue; + } + if (run(**L1, **L2)) { + // Remove L1 and L2 from Loops and add FusedLoop.
+ // Erase the later iterator first so the earlier one remains valid. + if (L1 < L2) { + Loops.erase(L2); + Loops.erase(L1); + } else { + Loops.erase(L1); + Loops.erase(L2); + } + Loops.push_back(FusedLoop); + L1 = L2 = Loops.begin(); + L1e = L2e = Loops.end(); + Changed = true; + } else + ++L2; + } + ++L1; + } + + if (LFuseVerify) { + LI->verify(*DT); + DT->verifyDomTree(); + assert((!verifyFunction(F, &dbgs())) && "Function verification failed!"); + } + + return Changed; +} + +char LoopFuse::ID; + +INITIALIZE_PASS_BEGIN(LoopFuse, "loop-fuse", "Loop Fusion", false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_END(LoopFuse, "loop-fuse", "Loop Fusion", false, false) + +namespace llvm { +FunctionPass *createLoopFusePass() { return new LoopFuse(); } +} diff --git a/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/llvm/lib/Transforms/Scalar/LoopRotation.cpp index fd22128f7fe6b8..34773d906e0481 100644 --- a/llvm/lib/Transforms/Scalar/LoopRotation.cpp +++ b/llvm/lib/Transforms/Scalar/LoopRotation.cpp @@ -32,6 +32,603 @@ static cl::opt<unsigned> DefaultRotationThreshold( "rotation-max-header-size", cl::init(16), cl::Hidden, cl::desc("The default maximum header size for automatic loop rotation")); +STATISTIC(NumRotated, "Number of loops rotated"); + +namespace { +/// A simple loop rotation transformation.
+class LoopRotate { + const unsigned MaxHeaderSize; + LoopInfo *LI; + const TargetTransformInfo *TTI; + AssumptionCache *AC; + DominatorTree *DT; + ScalarEvolution *SE; + const SimplifyQuery &SQ; + +public: + LoopRotate(unsigned MaxHeaderSize, LoopInfo *LI, + const TargetTransformInfo *TTI, AssumptionCache *AC, + DominatorTree *DT, ScalarEvolution *SE, const SimplifyQuery &SQ) + : MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE), + SQ(SQ) {} + bool processLoop(Loop *L); + +private: + bool rotateLoop(Loop *L, bool SimplifiedLatch); + bool simplifyLoopLatch(Loop *L); +}; +} // end anonymous namespace + +/// RewriteUsesOfClonedInstructions - We just cloned the instructions from the +/// old header into the preheader. If there were uses of the values produced by +/// these instructions that were outside of the loop, we have to insert PHI +/// nodes to merge the two values. Do this now. +static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader, + BasicBlock *OrigPreheader, + ValueToValueMapTy &ValueMap, + SmallVectorImpl<PHINode *> *InsertedPHIs) { + // Remove PHI node entries that are no longer live. + BasicBlock::iterator I, E = OrigHeader->end(); + for (I = OrigHeader->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I) + PN->removeIncomingValue(PN->getBasicBlockIndex(OrigPreheader)); + + // Now fix up users of the instructions in OrigHeader, inserting PHI nodes + // as necessary. + SSAUpdater SSA(InsertedPHIs); + for (I = OrigHeader->begin(); I != E; ++I) { + Value *OrigHeaderVal = &*I; + + // If there are no uses of the value (e.g. because it returns void), there + // is nothing to rewrite. + if (OrigHeaderVal->use_empty()) + continue; + + Value *OrigPreHeaderVal = ValueMap.lookup(OrigHeaderVal); + + // The value now exists in two versions: the initial value in the preheader + // and the loop "next" value in the original header.
+ SSA.Initialize(OrigHeaderVal->getType(), OrigHeaderVal->getName()); + SSA.AddAvailableValue(OrigHeader, OrigHeaderVal); + SSA.AddAvailableValue(OrigPreheader, OrigPreHeaderVal); + + // Visit each use of the OrigHeader instruction. + for (Value::use_iterator UI = OrigHeaderVal->use_begin(), + UE = OrigHeaderVal->use_end(); + UI != UE;) { + // Grab the use before incrementing the iterator. + Use &U = *UI; + + // Increment the iterator before removing the use from the list. + ++UI; + + // SSAUpdater can't handle a non-PHI use in the same block as an + // earlier def. We can easily handle those cases manually. + Instruction *UserInst = cast(U.getUser()); + if (!isa(UserInst)) { + BasicBlock *UserBB = UserInst->getParent(); + + // The original users in the OrigHeader are already using the + // original definitions. + if (UserBB == OrigHeader) + continue; + + // Users in the OrigPreHeader need to use the value to which the + // original definitions are mapped. + if (UserBB == OrigPreheader) { + U = OrigPreHeaderVal; + continue; + } + } + + // Anything else can be handled by SSAUpdater. + SSA.RewriteUse(U); + } + + // Replace MetadataAsValue(ValueAsMetadata(OrigHeaderVal)) uses in debug + // intrinsics. + SmallVector DbgValues; + llvm::findDbgValues(DbgValues, OrigHeaderVal); + for (auto &DbgValue : DbgValues) { + // The original users in the OrigHeader are already using the original + // definitions. + BasicBlock *UserBB = DbgValue->getParent(); + if (UserBB == OrigHeader) + continue; + + // Users in the OrigPreHeader need to use the value to which the + // original definitions are mapped and anything else can be handled by + // the SSAUpdater. To avoid adding PHINodes, check if the value is + // available in UserBB, if not substitute undef. 
+ Value *NewVal; + if (UserBB == OrigPreheader) + NewVal = OrigPreHeaderVal; + else if (SSA.HasValueForBlock(UserBB)) + NewVal = SSA.GetValueInMiddleOfBlock(UserBB); + else + NewVal = UndefValue::get(OrigHeaderVal->getType()); + DbgValue->setOperand(0, + MetadataAsValue::get(OrigHeaderVal->getContext(), + ValueAsMetadata::get(NewVal))); + } + } +} + +/// Propagate dbg.value intrinsics through the newly inserted Phis. +static void insertDebugValues(BasicBlock *OrigHeader, + SmallVectorImpl &InsertedPHIs) { + ValueToValueMapTy DbgValueMap; + + // Map existing PHI nodes to their dbg.values. + for (auto &I : *OrigHeader) { + if (auto DbgII = dyn_cast(&I)) { + if (auto *Loc = dyn_cast_or_null(DbgII->getVariableLocation())) + DbgValueMap.insert({Loc, DbgII}); + } + } + + // Then iterate through the new PHIs and look to see if they use one of the + // previously mapped PHIs. If so, insert a new dbg.value intrinsic that will + // propagate the info through the new PHI. + LLVMContext &C = OrigHeader->getContext(); + for (auto PHI : InsertedPHIs) { + for (auto VI : PHI->operand_values()) { + auto V = DbgValueMap.find(VI); + if (V != DbgValueMap.end()) { + auto *DbgII = cast(V->second); + Instruction *NewDbgII = DbgII->clone(); + auto PhiMAV = MetadataAsValue::get(C, ValueAsMetadata::get(PHI)); + NewDbgII->setOperand(0, PhiMAV); + BasicBlock *Parent = PHI->getParent(); + NewDbgII->insertBefore(Parent->getFirstNonPHIOrDbgOrLifetime()); + } + } + } +} + +/// Rotate loop LP. Return true if the loop is rotated. +/// +/// \param SimplifiedLatch is true if the latch was just folded into the final +/// loop exit. In this case we may want to rotate even though the new latch is +/// now an exiting branch. This rotation would have happened had the latch not +/// been simplified. However, if SimplifiedLatch is false, then we avoid +/// rotating loops in which the latch exits to avoid excessive or endless +/// rotation. 
LoopRotate should be repeatable and converge to a canonical +/// form. This property is satisfied because simplifying the loop latch can only +/// happen once across multiple invocations of the LoopRotate pass. +bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { + // If the loop has only one block then there is not much to rotate. + if (L->getBlocks().size() == 1) + return false; + + BasicBlock *OrigHeader = L->getHeader(); + BasicBlock *OrigLatch = L->getLoopLatch(); + + BranchInst *BI = dyn_cast(OrigHeader->getTerminator()); + if (!BI || BI->isUnconditional()) + return false; + + // If the loop header is not one of the loop exiting blocks then + // either this loop is already rotated or it is not + // suitable for loop rotation transformations. + if (!L->isLoopExiting(OrigHeader)) + return false; + + // If the loop latch already contains a branch that leaves the loop then the + // loop is already rotated. + if (!OrigLatch) + return false; + + // Rotate if either the loop latch does *not* exit the loop, or if the loop + // latch was just simplified. + if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch) + return false; + + // Check size of original header and reject loop if it is very big or we can't + // duplicate blocks inside it. + { + SmallPtrSet EphValues; + CodeMetrics::collectEphemeralValues(L, AC, EphValues); + + CodeMetrics Metrics; + Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues); + if (Metrics.notDuplicatable) { + DEBUG(dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable" + << " instructions: "; + L->dump()); + return false; + } + if (Metrics.convergent) { + DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent " + "instructions: "; + L->dump()); + return false; + } + if (Metrics.NumInsts > MaxHeaderSize) + return false; + } + + // Now, this loop is suitable for rotation. 
+ BasicBlock *OrigPreheader = L->getLoopPreheader(); + + // If the loop could not be converted to canonical form, it must have an + // indirectbr in it, just give up. + if (!OrigPreheader) + return false; + + if (isa(OrigPreheader->getTerminator())) { + DEBUG(dbgs() << "LoopRotation: Splitting header due to sync terminator.\n"); + BasicBlock *NewPreheader = SplitEdge(OrigPreheader, OrigHeader, DT, LI); + // SyncInst::Create(NewPreheader, OrigPreheader->getTerminator()); + // OrigPreheader->getTerminator()->eraseFromParent(); + OrigPreheader = NewPreheader; + } + + // Anything ScalarEvolution may know about this loop or the PHI nodes + // in its header will soon be invalidated. + if (SE) + SE->forgetLoop(L); + + DEBUG(dbgs() << "LoopRotation: rotating "; L->dump()); + + // Find new Loop header. NewHeader is a Header's one and only successor + // that is inside loop. Header's other successor is outside the + // loop. Otherwise loop is not suitable for rotation. + BasicBlock *Exit = BI->getSuccessor(0); + BasicBlock *NewHeader = BI->getSuccessor(1); + if (L->contains(Exit)) + std::swap(Exit, NewHeader); + assert(NewHeader && "Unable to determine new loop header"); + assert(L->contains(NewHeader) && !L->contains(Exit) && + "Unable to determine loop header and exit blocks"); + + // This code assumes that the new header has exactly one predecessor. + // Remove any single-entry PHI nodes in it. + assert(NewHeader->getSinglePredecessor() && + "New header doesn't have one pred!"); + FoldSingleEntryPHINodes(NewHeader); + + // Begin by walking OrigHeader and populating ValueMap with an entry for + // each Instruction. + BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end(); + ValueToValueMapTy ValueMap; + + // For PHI nodes, the value available in OldPreHeader is just the + // incoming value from OldPreHeader. 
+ for (; PHINode *PN = dyn_cast(I); ++I) + ValueMap[PN] = PN->getIncomingValueForBlock(OrigPreheader); + + // For the rest of the instructions, either hoist to the OrigPreheader if + // possible or create a clone in the OldPreHeader if not. + TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator(); + + // Record all debug intrinsics preceding LoopEntryBranch to avoid duplication. + using DbgIntrinsicHash = + std::pair, DIExpression *>; + auto makeHash = [](DbgInfoIntrinsic *D) -> DbgIntrinsicHash { + return {{D->getVariableLocation(), D->getVariable()}, D->getExpression()}; + }; + SmallDenseSet DbgIntrinsics; + for (auto I = std::next(OrigPreheader->rbegin()), E = OrigPreheader->rend(); + I != E; ++I) { + if (auto *DII = dyn_cast(&*I)) + DbgIntrinsics.insert(makeHash(DII)); + else + break; + } + + while (I != E) { + Instruction *Inst = &*I++; + + // If the instruction's operands are invariant and it doesn't read or write + // memory, then it is safe to hoist. Doing this doesn't change the order of + // execution in the preheader, but does prevent the instruction from + // executing in each iteration of the loop. This means it is safe to hoist + // something that might trap, but isn't safe to hoist something that reads + // memory (without proving that the loop doesn't write). + if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() && + !Inst->mayWriteToMemory() && !isa(Inst) && + !isa(Inst) && !isa(Inst)) { + Inst->moveBefore(LoopEntryBranch); + continue; + } + + // Otherwise, create a duplicate of the instruction. + Instruction *C = Inst->clone(); + + // Eagerly remap the operands of the instruction. + RemapInstruction(C, ValueMap, + RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); + + // Avoid inserting the same intrinsic twice. 
+    if (auto *DII = dyn_cast<DbgInfoIntrinsic>(C))
+      if (DbgIntrinsics.count(makeHash(DII))) {
+        C->deleteValue();
+        continue;
+      }
+
+    // With the operands remapped, see if the instruction constant folds or is
+    // otherwise simplifyable. This commonly occurs because the entry from PHI
+    // nodes allows icmps and other instructions to fold.
+    Value *V = SimplifyInstruction(C, SQ);
+    if (V && LI->replacementPreservesLCSSAForm(C, V)) {
+      // If so, then delete the temporary instruction and stick the folded value
+      // in the map.
+      ValueMap[Inst] = V;
+      if (!C->mayHaveSideEffects()) {
+        C->deleteValue();
+        C = nullptr;
+      }
+    } else {
+      ValueMap[Inst] = C;
+    }
+    if (C) {
+      // Otherwise, stick the new instruction into the new block!
+      C->setName(Inst->getName());
+      C->insertBefore(LoopEntryBranch);
+
+      if (auto *II = dyn_cast<IntrinsicInst>(C))
+        if (II->getIntrinsicID() == Intrinsic::assume)
+          AC->registerAssumption(II);
+    }
+  }
+
+  // Along with all the other instructions, we just cloned OrigHeader's
+  // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's
+  // successors by duplicating their incoming values for OrigHeader.
+  TerminatorInst *TI = OrigHeader->getTerminator();
+  for (BasicBlock *SuccBB : TI->successors())
+    for (BasicBlock::iterator BI = SuccBB->begin();
+         PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+      PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader);
+
+  // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove
+  // OrigPreHeader's old terminator (the original branch into the loop), and
+  // remove the corresponding incoming values from the PHI nodes in OrigHeader.
+  LoopEntryBranch->eraseFromParent();
+
+  SmallVector<PHINode *, 2> InsertedPHIs;
+  // If there were any uses of instructions in the duplicated block outside the
+  // loop, update them, inserting PHI nodes as required
+  RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap,
+                                  &InsertedPHIs);
+
+  // Attach dbg.value intrinsics to the new phis if that phi uses a value that
+  // previously had debug metadata attached. This keeps the debug info
+  // up-to-date in the loop body.
+  if (!InsertedPHIs.empty())
+    insertDebugValues(OrigHeader, InsertedPHIs);
+
+  // NewHeader is now the header of the loop.
+  L->moveToHeader(NewHeader);
+  assert(L->getHeader() == NewHeader && "Latch block is our new header");
+
+  // Inform DT about changes to the CFG.
+  if (DT) {
+    // The OrigPreheader branches to the NewHeader and Exit now. Then, inform
+    // the DT about the removed edge to the OrigHeader (that got removed).
+    SmallVector<DominatorTree::UpdateType, 3> Updates;
+    Updates.push_back({DominatorTree::Insert, OrigPreheader, Exit});
+    Updates.push_back({DominatorTree::Insert, OrigPreheader, NewHeader});
+    Updates.push_back({DominatorTree::Delete, OrigPreheader, OrigHeader});
+    DT->applyUpdates(Updates);
+  }
+
+  // At this point, we've finished our major CFG changes. As part of cloning
+  // the loop into the preheader we've simplified instructions and the
+  // duplicated conditional branch may now be branching on a constant. If it is
+  // branching on a constant and if that constant means that we enter the loop,
+  // then we fold away the cond branch to an uncond branch. This simplifies the
+  // loop in cases important for nested loops, and it also means we don't have
+  // to split as many edges.
+  BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator());
+  assert(PHBI->isConditional() && "Should be clone of BI condbr!");
+  if (!isa<ConstantInt>(PHBI->getCondition()) ||
+      PHBI->getSuccessor(cast<ConstantInt>(PHBI->getCondition())->isZero()) !=
+          NewHeader) {
+    // The conditional branch can't be folded, handle the general case.
+    // Split edges as necessary to preserve LoopSimplify form.
+
+    // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and
+    // thus is not a preheader anymore.
+    // Split the edge to form a real preheader.
+    BasicBlock *NewPH = SplitCriticalEdge(
+        OrigPreheader, NewHeader,
+        CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA());
+    NewPH->setName(NewHeader->getName() + ".lr.ph");
+
+    // Preserve canonical loop form, which means that 'Exit' should have only
+    // one predecessor. Note that Exit could be an exit block for multiple
+    // nested loops, causing both of the edges to now be critical and need to
+    // be split.
+    SmallVector<BasicBlock *, 4> ExitPreds(pred_begin(Exit), pred_end(Exit));
+    bool SplitLatchEdge = false;
+    for (BasicBlock *ExitPred : ExitPreds) {
+      // We only need to split loop exit edges.
+      Loop *PredLoop = LI->getLoopFor(ExitPred);
+      if (!PredLoop || PredLoop->contains(Exit))
+        continue;
+      if (isa<IndirectBrInst>(ExitPred->getTerminator()))
+        continue;
+      SplitLatchEdge |= L->getLoopLatch() == ExitPred;
+      BasicBlock *ExitSplit = SplitCriticalEdge(
+          ExitPred, Exit,
+          CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA());
+      ExitSplit->moveBefore(Exit);
+    }
+    assert(SplitLatchEdge &&
+           "Despite splitting all preds, failed to split latch exit?");
+  } else {
+    // We can fold the conditional branch in the preheader, this makes things
+    // simpler. The first step is to remove the extra edge to the Exit block.
+    Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/);
+    BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI);
+    NewBI->setDebugLoc(PHBI->getDebugLoc());
+    PHBI->eraseFromParent();
+
+    // With our CFG finalized, update DomTree if it is available.
+    if (DT) DT->deleteEdge(OrigPreheader, Exit);
+  }
+
+  assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation");
+  assert(L->getLoopLatch() && "Invalid loop latch after loop rotation");
+
+  // Now that the CFG and DomTree are in a consistent state again, try to merge
+  // the OrigHeader block into OrigLatch. This will succeed if they are
+  // connected by an unconditional branch. This is just a cleanup so the
+  // emitted code isn't too gross in this common case.
+  MergeBlockIntoPredecessor(OrigHeader, DT, LI);
+
+  DEBUG(dbgs() << "LoopRotation: into "; L->dump());
+
+  ++NumRotated;
+  return true;
+}
+
+/// Determine whether the instructions in this range may be safely and cheaply
+/// speculated. This is not an important enough situation to develop complex
+/// heuristics. We handle a single arithmetic instruction along with any type
+/// conversions.
+static bool shouldSpeculateInstrs(BasicBlock::iterator Begin,
+                                  BasicBlock::iterator End, Loop *L) {
+  bool seenIncrement = false;
+  bool MultiExitLoop = false;
+
+  if (!L->getExitingBlock())
+    MultiExitLoop = true;
+
+  for (BasicBlock::iterator I = Begin; I != End; ++I) {
+
+    if (!isSafeToSpeculativelyExecute(&*I))
+      return false;
+
+    if (isa<DbgInfoIntrinsic>(I))
+      continue;
+
+    switch (I->getOpcode()) {
+    default:
+      return false;
+    case Instruction::GetElementPtr:
+      // GEPs are cheap if all indices are constant.
+      if (!cast<GEPOperator>(I)->hasAllConstantIndices())
+        return false;
+      // fall-thru to increment case
+      LLVM_FALLTHROUGH;
+    case Instruction::Add:
+    case Instruction::Sub:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor:
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr: {
+      Value *IVOpnd =
+          !isa<Constant>(I->getOperand(0))
+              ? I->getOperand(0)
+              : !isa<Constant>(I->getOperand(1)) ? I->getOperand(1) : nullptr;
+      if (!IVOpnd)
+        return false;
+
+      // If increment operand is used outside of the loop, this speculation
+      // could cause extra live range interference.
+      if (MultiExitLoop) {
+        for (User *UseI : IVOpnd->users()) {
+          auto *UserInst = cast<Instruction>(UseI);
+          if (!L->contains(UserInst))
+            return false;
+        }
+      }
+
+      if (seenIncrement)
+        return false;
+      seenIncrement = true;
+      break;
+    }
+    case Instruction::Trunc:
+    case Instruction::ZExt:
+    case Instruction::SExt:
+      // ignore type conversions
+      break;
+    }
+  }
+  return true;
+}
+
+/// Fold the loop tail into the loop exit by speculating the loop tail
+/// instructions. Typically, this is a single post-increment. In the case of a
+/// simple 2-block loop, hoisting the increment can be much better than
+/// duplicating the entire loop header. In the case of loops with early exits,
+/// rotation will not work anyway, but simplifyLoopLatch will put the loop in
+/// canonical form so downstream passes can handle it.
+///
+/// I don't believe this invalidates SCEV.
+bool LoopRotate::simplifyLoopLatch(Loop *L) {
+  BasicBlock *Latch = L->getLoopLatch();
+  if (!Latch || Latch->hasAddressTaken())
+    return false;
+
+  BranchInst *Jmp = dyn_cast<BranchInst>(Latch->getTerminator());
+  if (!Jmp || !Jmp->isUnconditional())
+    return false;
+
+  BasicBlock *LastExit = Latch->getSinglePredecessor();
+  if (!LastExit || !L->isLoopExiting(LastExit))
+    return false;
+
+  BranchInst *BI = dyn_cast<BranchInst>(LastExit->getTerminator());
+  if (!BI)
+    return false;
+
+  if (!shouldSpeculateInstrs(Latch->begin(), Jmp->getIterator(), L))
+    return false;
+
+  DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into "
+               << LastExit->getName() << "\n");
+
+  // Hoist the instructions from Latch into LastExit.
+  LastExit->getInstList().splice(BI->getIterator(), Latch->getInstList(),
+                                 Latch->begin(), Jmp->getIterator());
+
+  unsigned FallThruPath = BI->getSuccessor(0) == Latch ? 0 : 1;
+  BasicBlock *Header = Jmp->getSuccessor(0);
+  assert(Header == L->getHeader() && "expected a backward branch");
+
+  // Remove Latch from the CFG so that LastExit becomes the new Latch.
+  BI->setSuccessor(FallThruPath, Header);
+  Latch->replaceSuccessorsPhiUsesWith(LastExit);
+  Jmp->eraseFromParent();
+
+  // Nuke the Latch block.
+  assert(Latch->empty() && "unable to evacuate Latch");
+  LI->removeBlock(Latch);
+  if (DT)
+    DT->eraseNode(Latch);
+  Latch->eraseFromParent();
+  return true;
+}
+
+/// Rotate \c L, and return true if any modification was made.
+bool LoopRotate::processLoop(Loop *L) {
+  // Save the loop metadata.
+  MDNode *LoopMD = L->getLoopID();
+
+  // Simplify the loop latch before attempting to rotate the header
+  // upward. Rotation may not be needed if the loop tail can be folded into the
+  // loop exit.
+  bool SimplifiedLatch = simplifyLoopLatch(L);
+
+  bool MadeChange = rotateLoop(L, SimplifiedLatch);
+  assert((!MadeChange || L->isLoopExiting(L->getLoopLatch())) &&
+         "Loop latch should be exiting after loop-rotate.");
+
+  // Restore the loop metadata.
+  // NB! We presume LoopRotation DOESN'T ADD its own metadata.
+  if ((MadeChange || SimplifiedLatch) && LoopMD)
+    L->setLoopID(LoopMD);
+
+  return MadeChange || SimplifiedLatch;
+}
+
 LoopRotatePass::LoopRotatePass(bool EnableHeaderDuplication)
     : EnableHeaderDuplication(EnableHeaderDuplication) {}
diff --git a/llvm/lib/Transforms/Scalar/SCCP.cpp b/llvm/lib/Transforms/Scalar/SCCP.cpp
index 2f6ed05c023b1e..c4dccc91b54056 100644
--- a/llvm/lib/Transforms/Scalar/SCCP.cpp
+++ b/llvm/lib/Transforms/Scalar/SCCP.cpp
@@ -611,6 +611,14 @@ class SCCPSolver : public InstVisitor<SCCPSolver> {
   void visitReturnInst(ReturnInst &I);
   void visitTerminator(Instruction &TI);
 
+  void visitReattachInst(ReattachInst &I) {
+    markOverdefined(&I);
+    visitTerminator(I);
+  }
+  void visitSyncInst(SyncInst &I) {
+    markOverdefined(&I);
+    visitTerminator(I);
+  }
   void visitCastInst(CastInst &I);
   void visitSelectInst(SelectInst &I);
@@ -734,6 +742,13 @@ void SCCPSolver::getFeasibleSuccessors(Instruction &TI,
     return;
   }
 
+  if (isa<DetachInst>(&TI) ||
+      isa<ReattachInst>(&TI) ||
+      isa<SyncInst>(&TI)) {
+    // All destinations are executable.
+    Succs.assign(TI.getNumSuccessors(), true);
+    return;
+  }
 
   LLVM_DEBUG(dbgs() << "Unknown terminator instruction: " << TI << '\n');
   llvm_unreachable("SCCP: Don't know how to handle this terminator!");
 }
@@ -745,6 +760,66 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) {
   // be more aggressive and try to consider edges which haven't been marked
   // yet, but there isn't any need.)
   return KnownFeasibleEdges.count(Edge(From, To));
+  assert(BBExecutable.count(To) && "Dest should always be alive!");
+
+  // Make sure the source basic block is executable!!
+  if (!BBExecutable.count(From)) return false;
+
+  // Check to make sure this edge itself is actually feasible now.
+  TerminatorInst *TI = From->getTerminator();
+  if (auto *BI = dyn_cast<BranchInst>(TI)) {
+    if (BI->isUnconditional())
+      return true;
+
+    LatticeVal BCValue = getValueState(BI->getCondition());
+
+    // Overdefined condition variables mean the branch could go either way,
+    // undef conditions mean that neither edge is feasible yet.
+    ConstantInt *CI = BCValue.getConstantInt();
+    if (!CI)
+      return !BCValue.isUnknown();
+
+    // Constant condition variables mean the branch can only go a single way.
+    return BI->getSuccessor(CI->isZero()) == To;
+  }
+
+  // Unwinding instructions successors are always executable.
+  if (TI->isExceptional())
+    return true;
+
+  if (auto *SI = dyn_cast<SwitchInst>(TI)) {
+    if (SI->getNumCases() < 1)
+      return true;
+
+    LatticeVal SCValue = getValueState(SI->getCondition());
+    ConstantInt *CI = SCValue.getConstantInt();
+
+    if (!CI)
+      return !SCValue.isUnknown();
+
+    return SI->findCaseValue(CI)->getCaseSuccessor() == To;
+  }
+
+  // In case of indirect branch and its address is a blockaddress, we mark
+  // the target as executable.
+  if (auto *IBR = dyn_cast<IndirectBrInst>(TI)) {
+    LatticeVal IBRValue = getValueState(IBR->getAddress());
+    BlockAddress *Addr = IBRValue.getBlockAddress();
+
+    if (!Addr)
+      return !IBRValue.isUnknown();
+
+    // At this point, the indirectbr is branching on a blockaddress.
+    return Addr->getBasicBlock() == To;
+  }
+
+  if (isa<DetachInst>(TI) ||
+      isa<ReattachInst>(TI) ||
+      isa<SyncInst>(TI))
+    return true;
+
+  LLVM_DEBUG(dbgs() << "Unknown terminator instruction: " << *TI << '\n');
+  llvm_unreachable("SCCP: Don't know how to handle this terminator!");
 }
 
 // visit Implementations - Something changed in this instruction, either an
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 68ca6c47c8f1a4..cef9cac89db330 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -3867,6 +3867,9 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
       // a direct store) as needing to be resplit because it is no longer
       // promotable.
       if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(StoreBasePtr)) {
+        assert((!FunctionContainsDetach ||
+                isAllocaParallelPromotable(OtherAI, *DT)) &&
+               "Alloca must be promotable");
         ResplitPromotableAllocas.insert(OtherAI);
         Worklist.insert(OtherAI);
       } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
@@ -3983,6 +3986,9 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
     if (!SplitLoads) {
       if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(LoadBasePtr)) {
         assert(OtherAI != &AI && "We can't re-split our own alloca!");
+        assert((!FunctionContainsDetach ||
+                isAllocaParallelPromotable(OtherAI, *DT)) &&
+               "Alloca must be promotable");
         ResplitPromotableAllocas.insert(OtherAI);
         Worklist.insert(OtherAI);
       } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
@@ -4152,9 +4158,16 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
       break;
     }
 
+  // Check if any detaches block promotion.
+  Promotable &= (!FunctionContainsDetach ||
+                 isAllocaParallelPromotable(NewAI, *DT));
+
   if (Promotable) {
     if (PHIUsers.empty() && SelectUsers.empty()) {
       // Promote the alloca.
+      assert((!FunctionContainsDetach ||
+              isAllocaParallelPromotable(NewAI, *DT)) &&
+             "Alloca must be promotable");
       PromotableAllocas.push_back(NewAI);
     } else {
       // If we have either PHIs or Selects to speculate, add them to those
@@ -4496,11 +4509,28 @@ PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT,
   DT = &RunDT;
   AC = &RunAC;
 
-  BasicBlock &EntryBB = F.getEntryBlock();
-  for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
-       I != E; ++I) {
-    if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
-      Worklist.insert(AI);
+  // BasicBlock &EntryBB = F.getEntryBlock();
+  // Scan the function to get its entry block and all entry blocks of detached
+  // CFG's. We can perform this scan for entry blocks once for the function,
+  // because this pass preserves the CFG.
+  SmallVector<BasicBlock *, 4> EntryBlocks;
+  FunctionContainsDetach = false;
+  EntryBlocks.push_back(&F.getEntryBlock());
+  for (BasicBlock &BB : F)
+    if (BasicBlock *Pred = BB.getUniquePredecessor())
+      if (DetachInst *DI = dyn_cast<DetachInst>(Pred->getTerminator())) {
+        FunctionContainsDetach = true;
+        if (DI->getDetached() == &BB)
+          EntryBlocks.push_back(&BB);
+      }
+
+  for (BasicBlock *BB : EntryBlocks) {
+    BasicBlock &EntryBB = *BB;
+    for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
+         I != E; ++I) {
+      if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
+        Worklist.insert(AI);
+    }
   }
 
   bool Changed = false;
diff --git a/llvm/lib/Transforms/Scalar/Scalar.cpp b/llvm/lib/Transforms/Scalar/Scalar.cpp
index 976daf4c78c2fd..67571aeeaf12c6 100644
--- a/llvm/lib/Transforms/Scalar/Scalar.cpp
+++ b/llvm/lib/Transforms/Scalar/Scalar.cpp
@@ -65,6 +65,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
   initializeLegacyLoopSinkPassPass(Registry);
   initializeLoopDataPrefetchLegacyPassPass(Registry);
   initializeLoopDeletionLegacyPassPass(Registry);
+  initializeLoopFusePass(Registry);
   initializeLoopAccessLegacyAnalysisPass(Registry);
   initializeLoopInstSimplifyLegacyPassPass(Registry);
   initializeLoopInterchangePass(Registry);
diff --git a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index b7b1db76b49237..f60e856a4d4285 100644
--- a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -38,6 +38,8 @@
 #include "llvm/IR/Module.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Scalar/SimplifyCFG.h"
 #include <utility>
@@ -143,6 +145,71 @@ static bool mergeEmptyReturnBlocks(Function &F) {
   return Changed;
 }
 
+static bool removeUselessSyncs(Function &F) {
+  bool Changed = false;
+  // Scan all the blocks in the function
+check:
+  for (Function::iterator BBI = F.begin(), E = F.end(); BBI != E; ) {
+    BasicBlock *BB = &*BBI++;
+    if (SyncInst *Sync = dyn_cast<SyncInst>(BB->getTerminator())) {
+      // Walk the CFG backwards to try to find a reaching detach instruction.
+      bool ReachingDetach = false;
+      SmallPtrSet<BasicBlock *, 8> Visited;
+      SmallVector<BasicBlock *, 8> WorkList;
+      WorkList.push_back(BB);
+      while (!WorkList.empty()) {
+        BasicBlock *PBB = WorkList.pop_back_val();
+        if (!Visited.insert(PBB).second)
+          continue;
+
+        for (pred_iterator PI = pred_begin(PBB), PE = pred_end(PBB);
+             PI != PE; ++PI) {
+          BasicBlock *Pred = *PI;
+          TerminatorInst *PT = Pred->getTerminator();
+          // Stop the traversal at the entry block of a detached CFG.
+          if (DetachInst *DI = dyn_cast<DetachInst>(PT)) {
+            if (DI->getDetached() == PBB)
+              continue;
+            else // DI->getContinue() == PBB
+              // This detach reaches the sync through the continuation edge.
+              ReachingDetach = true;
+          }
+          if (ReachingDetach)
+            break;
+
+          // Ignore predecessors via a reattach, which belong to child detached
+          // contexts.
+          if (isa<ReattachInst>(PT))
+            continue;
+
+          // For a predecessor terminated by a sync instruction, check the sync
+          // region it belongs to. If the sync belongs to a different sync
+          // region, add the block that starts that region. Otherwise, ignore
+          // the predecessor.
+          if (SyncInst *SI = dyn_cast<SyncInst>(PT)) {
+            if (SI->getSyncRegion() != Sync->getSyncRegion())
+              for (User *U : SI->getSyncRegion()->users())
+                if (isa<DetachInst>(U))
+                  WorkList.push_back(cast<DetachInst>(U)->getParent());
+            continue;
+          }
+
+          WorkList.push_back(Pred);
+        }
+      }
+
+      // If no detach reaches this sync, then this sync can be removed.
+      if (!ReachingDetach) {
+        BasicBlock *Succ = Sync->getSuccessor(0);
+        ReplaceInstWithInst(Sync, BranchInst::Create(Succ));
+        Changed = true;
+        if (MergeBlockIntoPredecessor(Succ)) goto check;
+      }
+    }
+  }
+  return Changed;
+}
+
 /// Call SimplifyCFG on all the blocks in the function,
 /// iterating until no more changes are made.
 static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
@@ -176,6 +243,7 @@ static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI,
   bool EverChanged = removeUnreachableBlocks(F);
   EverChanged |= mergeEmptyReturnBlocks(F);
   EverChanged |= iterativelySimplifyCFG(F, TTI, Options);
+  EverChanged |= removeUselessSyncs(F);
 
   // If neither pass changed anything, we're done.
   if (!EverChanged) return false;
@@ -191,6 +259,7 @@ static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI,
   do {
     EverChanged = iterativelySimplifyCFG(F, TTI, Options);
     EverChanged |= removeUnreachableBlocks(F);
+    EverChanged |= removeUselessSyncs(F);
   } while (EverChanged);
 
   return true;
diff --git a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
index 0f6db21f73b60e..7a24ab744b4b75 100644
--- a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -684,6 +684,38 @@ static bool eliminateRecursiveTailCall(
   return true;
 }
 
+static void getReturnBlocksToSync(
+    BasicBlock *Entry, SyncInst *Sync,
+    SmallVectorImpl<BasicBlock *> &ReturnBlocksToSync) {
+  // Walk the CFG from the entry block, stopping traversal at any sync within
+  // the same region. Record all blocks found that are terminated by a return
+  // instruction.
+  Value *SyncRegion = Sync->getSyncRegion();
+  SmallVector<BasicBlock *, 8> WorkList;
+  SmallPtrSet<BasicBlock *, 8> Visited;
+  WorkList.push_back(Entry);
+  while (!WorkList.empty()) {
+    BasicBlock *BB = WorkList.pop_back_val();
+    if (!Visited.insert(BB).second)
+      continue;
+
+    // Skip paths that are synced within the same region.
+    if (SyncInst *SI = dyn_cast<SyncInst>(BB->getTerminator()))
+      if (SI->getSyncRegion() == SyncRegion)
+        continue;
+
+    // If we find a return, we must add a sync before it if we eliminate a
+    // recursive tail call.
+    if (isa<ReturnInst>(BB->getTerminator()))
+      ReturnBlocksToSync.push_back(BB);
+
+    // Queue up successors to search.
+    for (BasicBlock *Succ : successors(BB))
+      if (Succ != Sync->getParent())
+        WorkList.push_back(Succ);
+  }
+}
+
 static bool foldReturnAndProcessPred(
     BasicBlock *BB, ReturnInst *Ret, BasicBlock *&OldEntry,
     bool &TailCallsAreMarkedTail, SmallVectorImpl<PHINode *> &ArgumentPHIs,
@@ -700,13 +732,17 @@ static bool foldReturnAndProcessPred(
   // predecessors and perform TRE there. Look for predecessors that end
   // in unconditional branch and recursive call(s).
   SmallVector<BranchInst *, 8> UncondBranchPreds;
+  SmallVector<SyncInst *, 8> SyncPreds;
   for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
     BasicBlock *Pred = *PI;
     Instruction *PTI = Pred->getTerminator();
     if (BranchInst *BI = dyn_cast<BranchInst>(PTI))
       if (BI->isUnconditional())
         UncondBranchPreds.push_back(BI);
+    if (SyncInst *SI = dyn_cast<SyncInst>(PTI))
+      SyncPreds.push_back(SI);
   }
+  BasicBlock *OldEntryBlock = &BB->getParent()->getEntryBlock();
 
   while (!UncondBranchPreds.empty()) {
     BranchInst *BI = UncondBranchPreds.pop_back_val();
@@ -730,6 +766,68 @@ static bool foldReturnAndProcessPred(
     }
   }
 
+  // If this loop runs, then the previous one could not have erased BB, because
+  // BB has a predecessor that is not an unconditional branch.
+  while (!SyncPreds.empty()) {
+    SyncInst *SI = SyncPreds.pop_back_val();
+    BasicBlock *Pred = SI->getParent();
+    if (CallInst *CI =
+            findTRECandidate(SI, CannotTailCallElimCallsMarkedTail, TTI)) {
+      // Check that all instructions between the candidate tail call and the
+      // sync can be moved above the call. In particular, we disallow
+      // accumulator recursion elimination for tail calls before a sync.
+      BasicBlock::iterator BBI(CI);
+      for (++BBI; &*BBI != SI; ++BBI)
+        if (!canMoveAboveCall(&*BBI, CI, AA))
+          break;
+      if (&*BBI != SI)
+        continue;
+
+      // Get the sync region for this sync.
+      Value *SyncRegion = SI->getSyncRegion();
+
+      // Check that the sync region begins in the entry block of the function.
+      if (cast<Instruction>(SyncRegion)->getParent() != OldEntryBlock) {
+        DEBUG(dbgs() << "Cannot eliminate tail call " << *CI <<
+              ": sync region does not start in entry block.");
+        continue;
+      }
+
+      // Get returns reachable from newly created loop.
+      SmallVector<BasicBlock *, 8> ReturnBlocksToSync;
+      getReturnBlocksToSync(OldEntryBlock, SI, ReturnBlocksToSync);
+
+      // Remove the sync.
+      ReturnInst *RI = FoldReturnIntoUncondBranch(Ret, BB, Pred);
+
+      // Cleanup: if all predecessors of BB have been eliminated by
+      // FoldReturnIntoUncondBranch, delete it. It is important to empty it,
+      // because the ret instruction in there is still using a value which
+      // eliminateRecursiveTailCall will attempt to remove.
+      if (!BB->hasAddressTaken() && pred_begin(BB) == pred_end(BB))
+        BB->eraseFromParent();
+
+      bool EliminatedTail =
+          eliminateRecursiveTailCall(CI, RI, OldEntry, TailCallsAreMarkedTail,
+                                     ArgumentPHIs, AA);
+
+      // If a recursive tail was eliminated, fix up the syncs and sync region in
+      // the CFG.
+      if (EliminatedTail) {
+        // Move the sync region start to the new entry block.
+        BasicBlock *NewEntry = &OldEntry->getParent()->getEntryBlock();
+        cast<Instruction>(SyncRegion)->moveBefore(&*(NewEntry->begin()));
+        // Insert syncs before relevant return blocks.
+        for (BasicBlock *RetBlock : ReturnBlocksToSync) {
+          BasicBlock *NewRetBlock = SplitBlock(RetBlock,
+                                              RetBlock->getTerminator());
+          ReplaceInstWithInst(RetBlock->getTerminator(),
+                              SyncInst::Create(NewRetBlock, SyncRegion));
+        }
+        Change = true;
+      }
+    }
+  }
 
   return Change;
 }
diff --git a/llvm/lib/Transforms/Tapir/CMakeLists.txt b/llvm/lib/Transforms/Tapir/CMakeLists.txt
new file mode 100644
index 00000000000000..568558d64e84ae
--- /dev/null
+++ b/llvm/lib/Transforms/Tapir/CMakeLists.txt
@@ -0,0 +1,18 @@
+add_llvm_library(LLVMTapirOpts
+  CilkABI.cpp
+  SmallBlock.cpp
+  RedundantSpawn.cpp
+  SpawnRestructure.cpp
+  SpawnUnswitch.cpp
+  SyncElimination.cpp
+  LowerToCilk.cpp
+  LoopSpawning.cpp
+  Outline.cpp
+  Tapir.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms/Tapir
+  )
+
+add_dependencies(LLVMTapirOpts intrinsics_gen)
diff --git a/llvm/lib/Transforms/Tapir/CilkABI.cpp b/llvm/lib/Transforms/Tapir/CilkABI.cpp
new file mode 100644
index 00000000000000..bf679d2e0c5377
--- /dev/null
+++ b/llvm/lib/Transforms/Tapir/CilkABI.cpp
@@ -0,0 +1,1344 @@
+//===- CilkABI.cpp - Lower Tapir into Cilk runtime system calls -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CilkABI interface, which is used to convert Tapir
+// instructions -- detach, reattach, and sync -- to calls into the Cilk
+// runtime system. This interface does the low-level dirty work of passes
+// such as LowerToCilk.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Tapir/CilkABI.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/Transforms/Tapir/Outline.h"
+#include "llvm/Transforms/Utils/EscapeEnumerator.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/TapirUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "cilkabi"
+
+/// Helper typedefs for cilk struct TypeBuilders.
+typedef llvm::TypeBuilder<__cilkrts_stack_frame, false> StackFrameBuilder;
+typedef llvm::TypeBuilder<__cilkrts_worker, false> WorkerBuilder;
+typedef llvm::TypeBuilder<__cilkrts_pedigree, false> PedigreeBuilder;
+
+/// Helper methods for storing to and loading from struct fields.
+static Value *GEP(IRBuilder<> &B, Value *Base, int field) {
+  // return B.CreateStructGEP(cast<PointerType>(Base->getType()),
+  //                          Base, field);
+  return B.CreateConstInBoundsGEP2_32(nullptr, Base, 0, field);
+}
+
+static void StoreField(IRBuilder<> &B, Value *Val, Value *Dst, int field,
+                       bool isVolatile = false) {
+  B.CreateStore(Val, GEP(B, Dst, field), isVolatile);
+}
+
+static Value *LoadField(IRBuilder<> &B, Value *Src, int field,
+                        bool isVolatile = false) {
+  return B.CreateLoad(GEP(B, Src, field), isVolatile);
+}
+
+/// \brief Emit inline assembly code to save the floating point
+/// state, for x86 only.
+static void EmitSaveFloatingPointState(IRBuilder<> &B, Value *SF) {
+  typedef void (AsmPrototype)(uint32_t*, uint16_t*);
+  llvm::FunctionType *FTy =
+      TypeBuilder<AsmPrototype, false>::get(B.getContext());
+
+  Value *Asm = InlineAsm::get(FTy,
+                              "stmxcsr $0\n\t" "fnstcw $1",
+                              "*m,*m,~{dirflag},~{fpsr},~{flags}",
+                              /*sideeffects*/ true);
+
+  Value *args[2] = {
+    GEP(B, SF, StackFrameBuilder::mxcsr),
+    GEP(B, SF, StackFrameBuilder::fpcsr)
+  };
+
+  B.CreateCall(Asm, args);
+}
+
+/// \brief Helper to find a function with the given name, creating it if it
+/// doesn't already exist. If the function needed to be created then return
+/// false, signifying that the caller needs to add the function body.
+template <typename T>
+static bool GetOrCreateFunction(const char *FnName, Module& M,
+                                Function *&Fn,
+                                Function::LinkageTypes Linkage =
+                                    Function::InternalLinkage,
+                                bool DoesNotThrow = true) {
+  LLVMContext &Ctx = M.getContext();
+
+  Fn = M.getFunction(FnName);
+
+  // if the function already exists then let the
+  // caller know that it is complete
+  if (Fn)
+    return true;
+
+  // Otherwise we have to create it
+  FunctionType *FTy = TypeBuilder<T, false>::get(Ctx);
+  Fn = Function::Create(FTy, Linkage, FnName, &M);
+
+  // Set nounwind if it does not throw.
+  if (DoesNotThrow)
+    Fn->setDoesNotThrow();
+
+  // and let the caller know that the function is incomplete
+  // and the body still needs to be added
+  return false;
+}
+
+/// \brief Emit a call to the CILK_SETJMP function.
+static CallInst *EmitCilkSetJmp(IRBuilder<> &B, Value *SF, Module& M) {
+  LLVMContext &Ctx = M.getContext();
+
+  // We always want to save the floating point state too
+  EmitSaveFloatingPointState(B, SF);
+
+  Type *Int32Ty = Type::getInt32Ty(Ctx);
+  Type *Int8PtrTy = Type::getInt8PtrTy(Ctx);
+
+  // Get the buffer to store program state
+  // Buffer is a void**.
+ Value *Buf = GEP(B, SF, StackFrameBuilder::ctx); + + // Store the frame pointer in the 0th slot + Value *FrameAddr = + B.CreateCall(Intrinsic::getDeclaration(&M, Intrinsic::frameaddress), + ConstantInt::get(Int32Ty, 0)); + + Value *FrameSaveSlot = GEP(B, Buf, 0); + B.CreateStore(FrameAddr, FrameSaveSlot, /*isVolatile=*/true); + + // Store stack pointer in the 2nd slot + Value *StackAddr = B.CreateCall( + Intrinsic::getDeclaration(&M, Intrinsic::stacksave)); + + Value *StackSaveSlot = GEP(B, Buf, 2); + B.CreateStore(StackAddr, StackSaveSlot, /*isVolatile=*/true); + + Buf = B.CreateBitCast(Buf, Int8PtrTy); + + // Call LLVM's EH setjmp, which is lightweight. + Value* F = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_setjmp); + + CallInst *SetjmpCall = B.CreateCall(F, Buf); + SetjmpCall->setCanReturnTwice(); + + return SetjmpCall; +} + +/// \brief Get or create a LLVM function for __cilkrts_pop_frame. +/// It is equivalent to the following C code +/// +/// __cilkrts_pop_frame(__cilkrts_stack_frame *sf) { +/// sf->worker->current_stack_frame = sf->call_parent; +/// sf->call_parent = 0; +/// } +static Function *Get__cilkrts_pop_frame(Module &M) { + Function *Fn = 0; + + if (GetOrCreateFunction("__cilkrts_pop_frame", M, Fn)) + return Fn; + + // If we get here we need to add the function body + LLVMContext &Ctx = M.getContext(); + + Function::arg_iterator args = Fn->arg_begin(); + Value *SF = &*args; + + BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", Fn); + IRBuilder<> B(Entry); + + // sf->worker->current_stack_frame = sf.call_parent; + StoreField(B, + LoadField(B, SF, StackFrameBuilder::call_parent, + /*isVolatile=*/true), + LoadField(B, SF, StackFrameBuilder::worker, + /*isVolatile=*/true), + WorkerBuilder::current_stack_frame, + /*isVolatile=*/true); + + // sf->call_parent = 0; + StoreField(B, + Constant::getNullValue( + TypeBuilder<__cilkrts_stack_frame*, false>::get(Ctx)), + SF, StackFrameBuilder::call_parent, /*isVolatile=*/true); + + B.CreateRetVoid(); + 
+ Fn->addFnAttr(Attribute::InlineHint); + + return Fn; +} + +/// \brief Get or create a LLVM function for __cilkrts_detach. +/// It is equivalent to the following C code +/// +/// void __cilkrts_detach(struct __cilkrts_stack_frame *sf) { +/// struct __cilkrts_worker *w = sf->worker; +/// struct __cilkrts_stack_frame *volatile *tail = w->tail; +/// +/// sf->spawn_helper_pedigree = w->pedigree; +/// sf->call_parent->parent_pedigree = w->pedigree; +/// +/// w->pedigree.rank = 0; +/// w->pedigree.next = &sf->spawn_helper_pedigree; +/// +/// *tail++ = sf->call_parent; +/// w->tail = tail; +/// +/// sf->flags |= CILK_FRAME_DETACHED; +/// } +static Function *Get__cilkrts_detach(Module &M) { + Function *Fn = 0; + + if (GetOrCreateFunction("__cilkrts_detach", M, Fn)) + return Fn; + + // If we get here we need to add the function body + LLVMContext &Ctx = M.getContext(); + + Function::arg_iterator args = Fn->arg_begin(); + Value *SF = &*args; + + BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", Fn); + IRBuilder<> B(Entry); + + // struct __cilkrts_worker *w = sf->worker; + Value *W = LoadField(B, SF, StackFrameBuilder::worker, + /*isVolatile=*/true); + + // __cilkrts_stack_frame *volatile *tail = w->tail; + Value *Tail = LoadField(B, W, WorkerBuilder::tail, + /*isVolatile=*/true); + + // sf->spawn_helper_pedigree = w->pedigree; + StoreField(B, + LoadField(B, W, WorkerBuilder::pedigree), + SF, StackFrameBuilder::parent_pedigree); + + // sf->call_parent->parent_pedigree = w->pedigree; + StoreField(B, + LoadField(B, W, WorkerBuilder::pedigree), + LoadField(B, SF, StackFrameBuilder::call_parent), + StackFrameBuilder::parent_pedigree); + + // w->pedigree.rank = 0; + { + StructType *STy = PedigreeBuilder::get(Ctx); + llvm::Type *Ty = STy->getElementType(PedigreeBuilder::rank); + StoreField(B, + ConstantInt::get(Ty, 0), + GEP(B, W, WorkerBuilder::pedigree), + PedigreeBuilder::rank); + } + + // w->pedigree.next = &sf->spawn_helper_pedigree; + StoreField(B, + GEP(B, SF, 
StackFrameBuilder::parent_pedigree),
+             GEP(B, W, WorkerBuilder::pedigree),
+             PedigreeBuilder::next);
+
+  // *tail++ = sf->call_parent;
+  B.CreateStore(LoadField(B, SF, StackFrameBuilder::call_parent,
+                          /*isVolatile=*/true),
+                Tail, /*isVolatile=*/true);
+  Tail = B.CreateConstGEP1_32(Tail, 1);
+
+  // w->tail = tail;
+  StoreField(B, Tail, W, WorkerBuilder::tail, /*isVolatile=*/true);
+
+  // sf->flags |= CILK_FRAME_DETACHED;
+  {
+    Value *F = LoadField(B, SF, StackFrameBuilder::flags, /*isVolatile=*/true);
+    F = B.CreateOr(F, ConstantInt::get(F->getType(), CILK_FRAME_DETACHED));
+    StoreField(B, F, SF, StackFrameBuilder::flags, /*isVolatile=*/true);
+  }
+
+  B.CreateRetVoid();
+
+  Fn->addFnAttr(Attribute::InlineHint);
+
+  return Fn;
+}
+
+/// \brief Get or create an LLVM function for __cilk_sync.
+/// Calls to this function are always inlined, as it saves
+/// the current stack/frame pointer values. This function must be marked
+/// as returns_twice to allow it to be inlined, since the call to setjmp
+/// is marked returns_twice.
+/// +/// It is equivalent to the following C code +/// +/// void __cilk_sync(struct __cilkrts_stack_frame *sf) { +/// if (sf->flags & CILK_FRAME_UNSYNCHED) { +/// sf->parent_pedigree = sf->worker->pedigree; +/// SAVE_FLOAT_STATE(*sf); +/// if (!CILK_SETJMP(sf->ctx)) +/// __cilkrts_sync(sf); +/// else if (sf->flags & CILK_FRAME_EXCEPTING) +/// __cilkrts_rethrow(sf); +/// } +/// ++sf->worker->pedigree.rank; +/// } +/// +/// With exceptions disabled in the compiler, the function +/// does not call __cilkrts_rethrow() +static Function *GetCilkSyncFn(Module &M, bool instrument = false) { + Function *Fn = nullptr; + + if (GetOrCreateFunction("__cilk_sync", M, Fn, + Function::InternalLinkage, + /*doesNotThrow*/false)) + return Fn; + + // If we get here we need to add the function body + LLVMContext &Ctx = M.getContext(); + + Function::arg_iterator args = Fn->arg_begin(); + Value *SF = &*args; + + BasicBlock *Entry = BasicBlock::Create(Ctx, "cilk.sync.test", Fn); + BasicBlock *SaveState = BasicBlock::Create(Ctx, "cilk.sync.savestate", Fn); + BasicBlock *SyncCall = BasicBlock::Create(Ctx, "cilk.sync.runtimecall", Fn); + BasicBlock *Excepting = BasicBlock::Create(Ctx, "cilk.sync.excepting", Fn); + // TODO: Detect whether exceptions are needed. 
+  BasicBlock *Rethrow = BasicBlock::Create(Ctx, "cilk.sync.rethrow", Fn);
+  BasicBlock *Exit = BasicBlock::Create(Ctx, "cilk.sync.end", Fn);
+
+  // Entry
+  {
+    IRBuilder<> B(Entry);
+
+    if (instrument)
+      // cilk_sync_begin
+      B.CreateCall(CILK_CSI_FUNC(sync_begin, M), SF);
+
+    // if (sf->flags & CILK_FRAME_UNSYNCHED)
+    Value *Flags = LoadField(B, SF, StackFrameBuilder::flags,
+                             /*isVolatile=*/true);
+    Flags = B.CreateAnd(Flags,
+                        ConstantInt::get(Flags->getType(),
+                                         CILK_FRAME_UNSYNCHED));
+    Value *Zero = ConstantInt::get(Flags->getType(), 0);
+    Value *Unsynced = B.CreateICmpEQ(Flags, Zero);
+    B.CreateCondBr(Unsynced, Exit, SaveState);
+  }
+
+  // SaveState
+  {
+    IRBuilder<> B(SaveState);
+
+    // sf.parent_pedigree = sf.worker->pedigree;
+    StoreField(B,
+               LoadField(B, LoadField(B, SF, StackFrameBuilder::worker,
+                                      /*isVolatile=*/true),
+                         WorkerBuilder::pedigree),
+               SF, StackFrameBuilder::parent_pedigree);
+
+    // if (!CILK_SETJMP(sf.ctx))
+    Value *C = EmitCilkSetJmp(B, SF, M);
+    C = B.CreateICmpEQ(C, ConstantInt::get(C->getType(), 0));
+    B.CreateCondBr(C, SyncCall, Excepting);
+  }
+
+  // SyncCall
+  {
+    IRBuilder<> B(SyncCall);
+
+    // __cilkrts_sync(&sf);
+    B.CreateCall(CILKRTS_FUNC(sync, M), SF);
+    B.CreateBr(Exit);
+  }
+
+  // Excepting
+  {
+    IRBuilder<> B(Excepting);
+    if (Rethrow) {
+      Value *Flags = LoadField(B, SF, StackFrameBuilder::flags,
+                               /*isVolatile=*/true);
+      Flags = B.CreateAnd(Flags,
+                          ConstantInt::get(Flags->getType(),
+                                           CILK_FRAME_EXCEPTING));
+      Value *Zero = ConstantInt::get(Flags->getType(), 0);
+      Value *CanExcept = B.CreateICmpEQ(Flags, Zero);
+      B.CreateCondBr(CanExcept, Exit, Rethrow);
+    } else {
+      B.CreateBr(Exit);
+    }
+  }
+
+  // Rethrow
+  if (Rethrow) {
+    IRBuilder<> B(Rethrow);
+    B.CreateCall(CILKRTS_FUNC(rethrow, M), SF)->setDoesNotReturn();
+    B.CreateUnreachable();
+  }
+
+  // Exit
+  {
+    IRBuilder<> B(Exit);
+
+    // ++sf.worker->pedigree.rank;
+    Value *Rank = LoadField(B, SF, StackFrameBuilder::worker,
+                            /*isVolatile=*/true);
+    Rank = GEP(B, Rank,
               WorkerBuilder::pedigree);
+    Rank = GEP(B, Rank, PedigreeBuilder::rank);
+    B.CreateStore(B.CreateAdd(
+                      B.CreateLoad(Rank),
+                      ConstantInt::get(Rank->getType()->getPointerElementType(),
+                                       1)),
+                  Rank);
+    if (instrument)
+      // cilk_sync_end
+      B.CreateCall(CILK_CSI_FUNC(sync_end, M), SF);
+
+    B.CreateRetVoid();
+  }
+
+  Fn->addFnAttr(Attribute::AlwaysInline);
+  Fn->addFnAttr(Attribute::ReturnsTwice);
+  return Fn;
+}
+
+/// \brief Get or create an LLVM function for __cilkrts_enter_frame.
+/// It is equivalent to the following C code:
+///
+/// void __cilkrts_enter_frame_1(struct __cilkrts_stack_frame *sf)
+/// {
+///   struct __cilkrts_worker *w = __cilkrts_get_tls_worker();
+///   if (w == 0) { /* slow path, rare */
+///     w = __cilkrts_bind_thread_1();
+///     sf->flags = CILK_FRAME_LAST | CILK_FRAME_VERSION;
+///   } else {
+///     sf->flags = CILK_FRAME_VERSION;
+///   }
+///   sf->call_parent = w->current_stack_frame;
+///   sf->worker = w;
+///   /* sf->except_data is only valid when CILK_FRAME_EXCEPTING is set */
+///   w->current_stack_frame = sf;
+/// }
+static Function *Get__cilkrts_enter_frame_1(Module &M) {
+  Function *Fn = nullptr;
+
+  if (GetOrCreateFunction("__cilkrts_enter_frame_1", M, Fn))
+    return Fn;
+
+  LLVMContext &Ctx = M.getContext();
+  Function::arg_iterator args = Fn->arg_begin();
+  Value *SF = &*args;
+
+  BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", Fn);
+  BasicBlock *SlowPath = BasicBlock::Create(Ctx, "slowpath", Fn);
+  BasicBlock *FastPath = BasicBlock::Create(Ctx, "fastpath", Fn);
+  BasicBlock *Cont = BasicBlock::Create(Ctx, "cont", Fn);
+
+  llvm::PointerType *WorkerPtrTy =
+      TypeBuilder<__cilkrts_worker*, false>::get(Ctx);
+  StructType *SFTy = StackFrameBuilder::get(Ctx);
+
+  // Block (Entry)
+  CallInst *W = nullptr;
+  {
+    IRBuilder<> B(Entry);
+    if (fastCilk)
+      W = B.CreateCall(CILKRTS_FUNC(get_tls_worker_fast, M));
+    else
+      W = B.CreateCall(CILKRTS_FUNC(get_tls_worker, M));
+
+    Value *Cond = B.CreateICmpEQ(W, ConstantPointerNull::get(WorkerPtrTy));
+    B.CreateCondBr(Cond, SlowPath, FastPath);
+  }
+  // Block (SlowPath)
+  CallInst *Wslow = nullptr;
+  {
+    IRBuilder<> B(SlowPath);
+    Wslow = B.CreateCall(CILKRTS_FUNC(bind_thread_1, M));
+    llvm::Type *Ty = SFTy->getElementType(StackFrameBuilder::flags);
+    StoreField(B,
+               ConstantInt::get(Ty, CILK_FRAME_LAST | CILK_FRAME_VERSION),
+               SF, StackFrameBuilder::flags, /*isVolatile=*/true);
+    B.CreateBr(Cont);
+  }
+  // Block (FastPath)
+  {
+    IRBuilder<> B(FastPath);
+    llvm::Type *Ty = SFTy->getElementType(StackFrameBuilder::flags);
+    StoreField(B,
+               ConstantInt::get(Ty, CILK_FRAME_VERSION),
+               SF, StackFrameBuilder::flags, /*isVolatile=*/true);
+    B.CreateBr(Cont);
+  }
+  // Block (Cont)
+  {
+    IRBuilder<> B(Cont);
+    Value *Wfast = W;
+    PHINode *W = B.CreatePHI(WorkerPtrTy, 2);
+    W->addIncoming(Wslow, SlowPath);
+    W->addIncoming(Wfast, FastPath);
+
+    StoreField(B,
+               LoadField(B, W, WorkerBuilder::current_stack_frame,
+                         /*isVolatile=*/true),
+               SF, StackFrameBuilder::call_parent,
+               /*isVolatile=*/true);
+
+    StoreField(B, W, SF, StackFrameBuilder::worker, /*isVolatile=*/true);
+    StoreField(B, SF, W, WorkerBuilder::current_stack_frame,
+               /*isVolatile=*/true);
+
+    B.CreateRetVoid();
+  }
+
+  Fn->addFnAttr(Attribute::InlineHint);
+
+  return Fn;
+}
+
+/// \brief Get or create an LLVM function for __cilkrts_enter_frame_fast.
+/// It is equivalent to the following C code:
+///
+/// void __cilkrts_enter_frame_fast_1(struct __cilkrts_stack_frame *sf)
+/// {
+///   struct __cilkrts_worker *w = __cilkrts_get_tls_worker();
+///   sf->flags = CILK_FRAME_VERSION;
+///   sf->call_parent = w->current_stack_frame;
+///   sf->worker = w;
+///   /* sf->except_data is only valid when CILK_FRAME_EXCEPTING is set */
+///   w->current_stack_frame = sf;
+/// }
+static Function *Get__cilkrts_enter_frame_fast_1(Module &M) {
+  Function *Fn = nullptr;
+
+  if (GetOrCreateFunction("__cilkrts_enter_frame_fast_1", M, Fn))
+    return Fn;
+
+  LLVMContext &Ctx = M.getContext();
+  Function::arg_iterator args = Fn->arg_begin();
+  Value *SF = &*args;
+
+  BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", Fn);
+
+  IRBuilder<> B(Entry);
+  Value *W;
+
+  if (fastCilk)
+    W = B.CreateCall(CILKRTS_FUNC(get_tls_worker_fast, M));
+  else
+    W = B.CreateCall(CILKRTS_FUNC(get_tls_worker, M));
+
+  StructType *SFTy = StackFrameBuilder::get(Ctx);
+  llvm::Type *Ty = SFTy->getElementType(StackFrameBuilder::flags);
+
+  StoreField(B,
+             ConstantInt::get(Ty, CILK_FRAME_VERSION),
+             SF, StackFrameBuilder::flags, /*isVolatile=*/true);
+  StoreField(B,
+             LoadField(B, W, WorkerBuilder::current_stack_frame,
+                       /*isVolatile=*/true),
+             SF, StackFrameBuilder::call_parent,
+             /*isVolatile=*/true);
+  StoreField(B, W, SF, StackFrameBuilder::worker, /*isVolatile=*/true);
+  StoreField(B, SF, W, WorkerBuilder::current_stack_frame, /*isVolatile=*/true);
+
+  B.CreateRetVoid();
+
+  Fn->addFnAttr(Attribute::InlineHint);
+
+  return Fn;
+}
+
+// /// \brief Get or create an LLVM function for __cilk_parent_prologue.
+// /// It is equivalent to the following C code:
+// ///
+// /// void __cilk_parent_prologue(__cilkrts_stack_frame *sf) {
+// ///   __cilkrts_enter_frame_1(sf);
+// /// }
+// static Function *GetCilkParentPrologue(Module &M) {
+//   Function *Fn = 0;
+
+//   if (GetOrCreateFunction("__cilk_parent_prologue", M, Fn))
+//     return Fn;
+
+//   // If we get here we need to add the function body.
+//   LLVMContext &Ctx = M.getContext();
+
+//   Function::arg_iterator args = Fn->arg_begin();
+//   Value *SF = &*args;
+
+//   BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", Fn);
+//   IRBuilder<> B(Entry);
+
+//   // __cilkrts_enter_frame_1(sf)
+//   B.CreateCall(CILKRTS_FUNC(enter_frame_1, M), SF);
+
+//   B.CreateRetVoid();
+
+//   Fn->addFnAttr(Attribute::InlineHint);
+
+//   return Fn;
+// }
+
+/// \brief Get or create an LLVM function for __cilk_parent_epilogue.
+/// It is equivalent to the following C code:
+///
+/// void __cilk_parent_epilogue(__cilkrts_stack_frame *sf) {
+///   __cilkrts_pop_frame(sf);
+///   if (sf->flags != CILK_FRAME_VERSION)
+///     __cilkrts_leave_frame(sf);
+/// }
+static Function *GetCilkParentEpilogue(Module &M, bool instrument = false) {
+  Function *Fn = nullptr;
+
+  if (GetOrCreateFunction("__cilk_parent_epilogue", M, Fn))
+    return Fn;
+
+  // If we get here we need to add the function body.
+  LLVMContext &Ctx = M.getContext();
+
+  Function::arg_iterator args = Fn->arg_begin();
+  Value *SF = &*args;
+
+  BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", Fn),
+             *B1 = BasicBlock::Create(Ctx, "body", Fn),
+             *Exit = BasicBlock::Create(Ctx, "exit", Fn);
+
+  // Entry
+  {
+    IRBuilder<> B(Entry);
+
+    if (instrument)
+      // cilk_leave_begin
+      B.CreateCall(CILK_CSI_FUNC(leave_begin, M), SF);
+
+    // __cilkrts_pop_frame(sf)
+    B.CreateCall(CILKRTS_FUNC(pop_frame, M), SF);
+
+    // if (sf->flags != CILK_FRAME_VERSION)
+    Value *Flags = LoadField(B, SF, StackFrameBuilder::flags,
+                             /*isVolatile=*/true);
+    Value *Cond = B.CreateICmpNE(Flags,
+                                 ConstantInt::get(Flags->getType(),
                                                  CILK_FRAME_VERSION));
+    B.CreateCondBr(Cond, B1, Exit);
+  }
+
+  // B1
+  {
+    IRBuilder<> B(B1);
+
+    // __cilkrts_leave_frame(sf);
+    B.CreateCall(CILKRTS_FUNC(leave_frame, M), SF);
+    B.CreateBr(Exit);
+  }
+
+  // Exit
+  {
+    IRBuilder<> B(Exit);
+    if (instrument)
+      // cilk_leave_end
+      B.CreateCall(CILK_CSI_FUNC(leave_end, M));
+    B.CreateRetVoid();
+  }
+
+  Fn->addFnAttr(Attribute::InlineHint);
+
+  return Fn;
+}
+
+static const StringRef stack_frame_name = "__cilkrts_sf";
+static const StringRef worker8_name = "__cilkrts_wc8";
+
+// static llvm::Value *LookupStackFrame(Function &F) {
+//   return F.getValueSymbolTable()->lookup(stack_frame_name);
+// }
+
+/// \brief Create the __cilkrts_stack_frame for the spawning function.
+static AllocaInst *CreateStackFrame(Function &F) {
+  // assert(!LookupStackFrame(F) && "already created the stack frame");
+
+  LLVMContext &Ctx = F.getContext();
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  Type *SFTy = StackFrameBuilder::get(Ctx);
+
+  Instruction *I = F.getEntryBlock().getFirstNonPHIOrDbgOrLifetime();
+
+  AllocaInst *SF = new AllocaInst(SFTy, DL.getAllocaAddrSpace(),
+                                  /*size*/nullptr, 8,
+                                  /*name*/stack_frame_name, /*insert before*/I);
+  if (!I)
+    F.getEntryBlock().getInstList().push_back(SF);
+
+  return SF;
+}
+
+Value* GetOrInitCilkStackFrame(Function& F,
+                               ValueToValueMapTy &DetachCtxToStackFrame,
+                               bool Helper = true, bool instrument = false) {
+  // Value* V = LookupStackFrame(F);
+  Value *V = DetachCtxToStackFrame[&F];
+  if (V) return V;
+
+  AllocaInst* alloc = CreateStackFrame(F);
+  DetachCtxToStackFrame[&F] = alloc;
+  BasicBlock::iterator II = F.getEntryBlock().getFirstInsertionPt();
+  AllocaInst* curinst;
+  do {
+    curinst = dyn_cast<AllocaInst>(II);
+    II++;
+  } while (curinst != alloc);
+  Value *StackSave;
+  IRBuilder<> IRB(&(F.getEntryBlock()), II);
+
+  if (instrument) {
+    Type *Int8PtrTy = IRB.getInt8PtrTy();
+    Value *ThisFn = ConstantExpr::getBitCast(&F, Int8PtrTy);
+    Value *ReturnAddress =
+        IRB.CreateCall(Intrinsic::getDeclaration(F.getParent(),
+                                                 Intrinsic::returnaddress),
+                       IRB.getInt32(0));
+    StackSave =
+        IRB.CreateCall(Intrinsic::getDeclaration(F.getParent(),
+                                                 Intrinsic::stacksave));
+    if (Helper) {
+      Value *begin_args[3] = { alloc, ThisFn, ReturnAddress };
+      IRB.CreateCall(CILK_CSI_FUNC(enter_helper_begin, *F.getParent()),
+                     begin_args);
+    } else {
+      Value *begin_args[4] = { IRB.getInt32(0), alloc, ThisFn, ReturnAddress };
+      IRB.CreateCall(CILK_CSI_FUNC(enter_begin, *F.getParent()), begin_args);
+    }
+  }
+  Value *args[1] = { alloc };
+  if (Helper)
+    IRB.CreateCall(CILKRTS_FUNC(enter_frame_fast_1, *F.getParent()), args);
+  else
+    IRB.CreateCall(CILKRTS_FUNC(enter_frame_1, *F.getParent()), args);
+  /* inst->insertAfter(alloc); */
+
+  if (instrument) {
+    Value* end_args[2] = { alloc, StackSave };
+    IRB.CreateCall(CILK_CSI_FUNC(enter_end, *F.getParent()), end_args);
+  }
+
+  EscapeEnumerator EE(F, "cilkabi_epilogue", false);
+  while (IRBuilder<> *AtExit = EE.Next()) {
+    if (isa<ReturnInst>(AtExit->GetInsertPoint()))
+      AtExit->CreateCall(GetCilkParentEpilogue(*F.getParent(), instrument),
+                         args, "");
+  }
+
+  // // The function exits are unified before lowering.
+  // ReturnInst *retInst = nullptr;
+  // for (BasicBlock &BB : F) {
+  //   TerminatorInst* TI = BB.getTerminator();
+  //   if (!TI) continue;
+  //   if (ReturnInst* RI = llvm::dyn_cast<ReturnInst>(TI)) {
+  //     assert(!retInst && "Multiple returns found.");
+  //     retInst = RI;
+  //   }
+  // }
+
+  // assert(retInst && "No returns found.");
+  // CallInst::Create(GetCilkParentEpilogue(*F.getParent(), instrument), args, "",
+  //                  retInst);
+  return alloc;
+}
+
+static inline
+bool makeFunctionDetachable(Function &extracted,
+                            ValueToValueMapTy &DetachCtxToStackFrame,
+                            bool instrument = false) {
+  Module *M = extracted.getParent();
+  // LLVMContext& Context = extracted.getContext();
+  // const DataLayout& DL = M->getDataLayout();
+  /*
+    __cilkrts_stack_frame sf;
+    __cilkrts_enter_frame_fast_1(&sf);
+    __cilkrts_detach();
+    *x = f(y);
+  */
+
+  Value *SF = CreateStackFrame(extracted);
+  DetachCtxToStackFrame[&extracted] = SF;
+  assert(SF);
+  Value *args[1] = { SF };
+
+  // Scan function to see if it detaches.
+  bool SimpleHelper = true;
+  for (BasicBlock &BB : extracted) {
+    if (isa<DetachInst>(BB.getTerminator())) {
+      SimpleHelper = false;
+      break;
+    }
+  }
+  if (!SimpleHelper)
+    DEBUG(dbgs() << "Detachable helper function itself detaches.\n");
+
+  BasicBlock::iterator II = extracted.getEntryBlock().getFirstInsertionPt();
+  AllocaInst* curinst;
+  do {
+    curinst = dyn_cast<AllocaInst>(II);
+    II++;
+  } while (curinst != SF);
+  Value *StackSave;
+  IRBuilder<> IRB(&(extracted.getEntryBlock()), II);
+
+  if (instrument) {
+    Type *Int8PtrTy = IRB.getInt8PtrTy();
+    Value *ThisFn = ConstantExpr::getBitCast(&extracted, Int8PtrTy);
+    Value *ReturnAddress =
+        IRB.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::returnaddress),
+                       IRB.getInt32(0));
+    StackSave =
+        IRB.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::stacksave));
+    if (SimpleHelper) {
+      Value *begin_args[3] = { SF, ThisFn, ReturnAddress };
+      IRB.CreateCall(CILK_CSI_FUNC(enter_helper_begin, *M), begin_args);
+    } else {
+      Value *begin_args[4] = { IRB.getInt32(0),
                               SF, ThisFn, ReturnAddress };
+      IRB.CreateCall(CILK_CSI_FUNC(enter_begin, *M), begin_args);
+    }
+  }
+
+  if (SimpleHelper)
+    IRB.CreateCall(CILKRTS_FUNC(enter_frame_fast_1, *M), args);
+  else
+    IRB.CreateCall(CILKRTS_FUNC(enter_frame_1, *M), args);
+
+  if (instrument) {
+    Value *end_args[2] = { SF, StackSave };
+    IRB.CreateCall(CILK_CSI_FUNC(enter_end, *M), end_args);
+  }
+
+  // Call __cilkrts_detach
+  {
+    if (instrument)
+      IRB.CreateCall(CILK_CSI_FUNC(detach_begin, *M), args);
+
+    IRB.CreateCall(CILKRTS_FUNC(detach, *M), args);
+
+    if (instrument)
+      IRB.CreateCall(CILK_CSI_FUNC(detach_end, *M));
+  }
+
+  EscapeEnumerator EE(extracted, "cilkabi_epilogue", false);
+  while (IRBuilder<> *AtExit = EE.Next()) {
+    if (isa<ReturnInst>(AtExit->GetInsertPoint()))
+      AtExit->CreateCall(GetCilkParentEpilogue(*M, instrument), args, "");
+    else if (ResumeInst *RI = dyn_cast<ResumeInst>(AtExit->GetInsertPoint())) {
+      /*
+        sf.flags = sf.flags | CILK_FRAME_EXCEPTING;
+        sf.except_data = Exn;
+      */
+      IRBuilder<> B(RI);
+      Value *Exn = AtExit->CreateExtractValue(RI->getValue(),
+                                              ArrayRef<unsigned>(0));
+      Value *Flags = LoadField(*AtExit, SF, StackFrameBuilder::flags,
+                               /*isVolatile=*/true);
+      Flags = AtExit->CreateOr(Flags,
+                               ConstantInt::get(Flags->getType(),
+                                                CILK_FRAME_EXCEPTING));
+      StoreField(*AtExit, Exn, SF, StackFrameBuilder::except_data);
+      /*
+        __cilkrts_pop_frame(&sf);
+        if (sf->flags)
+          __cilkrts_leave_frame(&sf);
+      */
+      AtExit->CreateCall(GetCilkParentEpilogue(*M, instrument), args, "");
+      // CallInst::Create(GetCilkParentEpilogue(*M, instrument), args, "", RI);
+    }
+  }
+
+  // // Handle returns
+  // ReturnInst* Ret = nullptr;
+  // for (BasicBlock &BB : extracted) {
+  //   TerminatorInst* TI = BB.getTerminator();
+  //   if (!TI) continue;
+  //   if (ReturnInst* RI = dyn_cast<ReturnInst>(TI)) {
+  //     assert(Ret == nullptr && "Multiple return");
+  //     Ret = RI;
+  //   }
+  // }
+  // assert(Ret && "No return from extract function");
+
+  // /*
+  //   __cilkrts_pop_frame(&sf);
+  //   if (sf->flags)
+  //     __cilkrts_leave_frame(&sf);
+  // */
+  // CallInst::Create(GetCilkParentEpilogue(*M, instrument), args, "", Ret);
+
+  // // Handle resumes
+  // for (BasicBlock &BB : extracted) {
+  //   if (!isa<ResumeInst>(BB.getTerminator()))
+  //     continue;
+  //   ResumeInst *RI = cast<ResumeInst>(BB.getTerminator());
+  //   /*
+  //     sf.flags = sf.flags | CILK_FRAME_EXCEPTING;
+  //     sf.except_data = Exn;
+  //   */
+  //   IRBuilder<> B(RI);
+  //   Value *Exn = B.CreateExtractValue(RI->getValue(), ArrayRef<unsigned>(0));
+  //   Value *Flags = LoadField(B, SF, StackFrameBuilder::flags,
+  //                            /*isVolatile=*/true);
+  //   Flags = B.CreateOr(Flags,
+  //                      ConstantInt::get(Flags->getType(),
+  //                                       CILK_FRAME_EXCEPTING));
+  //   StoreField(B, Exn, SF, StackFrameBuilder::except_data);
+  //   /*
+  //     __cilkrts_pop_frame(&sf);
+  //     if (sf->flags)
+  //       __cilkrts_leave_frame(&sf);
+  //   */
+  //   CallInst::Create(GetCilkParentEpilogue(*M, instrument), args, "", RI);
+  // }
+
+  return true;
+}
+
+//##############################################################################
+
+/// \brief Get/Create the worker count for the spawning function.
+Value* llvm::cilk::GetOrCreateWorker8(Function &F) {
+  // Value* W8 = F.getValueSymbolTable()->lookup(worker8_name);
+  // if (W8) return W8;
+  IRBuilder<> B(F.getEntryBlock().getFirstNonPHIOrDbgOrLifetime());
+  Value *P0 = B.CreateCall(CILKRTS_FUNC(get_nworkers, *F.getParent()));
+  Value *P8 = B.CreateMul(P0, ConstantInt::get(P0->getType(), 8), worker8_name);
+  return P8;
+}
+
+void llvm::cilk::createSync(SyncInst &SI, ValueToValueMapTy &DetachCtxToStackFrame,
+                            bool instrument) {
+  Function &Fn = *(SI.getParent()->getParent());
+  Module &M = *(Fn.getParent());
+
+  Value *SF = GetOrInitCilkStackFrame(Fn, DetachCtxToStackFrame,
+                                      /*isFast*/false, instrument);
+  Value *args[] = { SF };
+  assert(args[0] && "sync used in function without frame!"
         );
+  CallInst *CI = CallInst::Create(GetCilkSyncFn(M, instrument), args, "",
+                                  /*insert before*/&SI);
+  CI->setDebugLoc(SI.getDebugLoc());
+  BasicBlock *Succ = SI.getSuccessor(0);
+  SI.eraseFromParent();
+  BranchInst::Create(Succ, CI->getParent());
+}
+
+bool llvm::cilk::verifyDetachedCFG(const DetachInst &Detach, DominatorTree &DT,
+                                   bool error) {
+  BasicBlock *Spawned = Detach.getDetached();
+  BasicBlock *Continue = Detach.getContinue();
+  BasicBlockEdge DetachEdge(Detach.getParent(), Spawned);
+
+  SmallVector<BasicBlock *, 32> Todo;
+  SmallPtrSet<BasicBlock *, 32> functionPieces;
+  SmallVector<BasicBlock *, 4> WorkListEH;
+  Todo.push_back(Spawned);
+
+  while (!Todo.empty()) {
+    BasicBlock *BB = Todo.pop_back_val();
+
+    if (!functionPieces.insert(BB).second)
+      continue;
+
+    TerminatorInst* Term = BB->getTerminator();
+    if (Term == nullptr) return false;
+    if (ReattachInst* Inst = dyn_cast<ReattachInst>(Term)) {
+      // Only analyze reattaches going to the same continuation.
+      if (Inst->getSuccessor(0) != Continue) continue;
+      continue;
+    } else if (DetachInst* Inst = dyn_cast<DetachInst>(Term)) {
+      assert(Inst != &Detach && "Found recursive Detach!");
+      Todo.push_back(Inst->getSuccessor(0));
+      Todo.push_back(Inst->getSuccessor(1));
+      continue;
+    } else if (SyncInst* Inst = dyn_cast<SyncInst>(Term)) {
+      // Only sync inner elements; consider as a branch.
+      Todo.push_back(Inst->getSuccessor(0));
+      continue;
+    } else if (isa<BranchInst>(Term) || isa<SwitchInst>(Term) ||
+               isa<InvokeInst>(Term)) {
+      for (BasicBlock *Succ : successors(BB)) {
+        if (!DT.dominates(DetachEdge, Succ))
+          // We assume that this block is an exception-handling block and save
+          // it for later processing.
+          WorkListEH.push_back(Succ);
+        else
+          Todo.push_back(Succ);
+      }
+      continue;
+    } else if (isa<UnreachableInst>(Term) || isa<ResumeInst>(Term)) {
+      continue;
+    } else {
+      DEBUG(Term->dump());
+      DEBUG(Term->getParent()->getParent()->dump());
+      assert(!error && "Detached block did not absolutely terminate in reattach");
+      return false;
+    }
+  }
+  {
+    SmallPtrSet<BasicBlock *, 4> Visited;
+    while (!WorkListEH.empty()) {
+      BasicBlock *BB = WorkListEH.pop_back_val();
+      if (!Visited.insert(BB).second)
+        continue;
+
+      // Make sure that the control flow through these exception-handling blocks
+      // cannot re-enter the blocks being outlined.
+      assert(!functionPieces.count(BB) &&
+             "EH blocks for a detached region reenter that region.");
+
+      // Make sure that the control flow through these exception-handling blocks
+      // doesn't perform an ordinary return.
+      assert(!isa<ReturnInst>(BB->getTerminator()) &&
+             "EH block terminated by return.");
+
+      // Make sure that the control flow through these exception-handling blocks
+      // doesn't reattach to the detached CFG's continuation.
+      if (ReattachInst *RI = dyn_cast<ReattachInst>(BB->getTerminator()))
+        assert(RI->getSuccessor(0) != Continue &&
+               "Exit block reaches a reattach to the continuation.");
+
+      for (BasicBlock *Succ : successors(BB))
+        WorkListEH.push_back(Succ);
+    }
+  }
+  return true;
+}
+
+bool llvm::cilk::populateDetachedCFG(
+    const DetachInst &Detach, DominatorTree &DT,
+    SmallPtrSetImpl<BasicBlock *> &functionPieces,
+    SmallVectorImpl<BasicBlock *> &reattachB,
+    SmallPtrSetImpl<BasicBlock *> &ExitBlocks,
+    bool replace, bool error) {
+  SmallVector<BasicBlock *, 32> Todo;
+  SmallVector<BasicBlock *, 4> WorkListEH;
+
+  BasicBlock *Spawned = Detach.getDetached();
+  BasicBlock *Continue = Detach.getContinue();
+  BasicBlockEdge DetachEdge(Detach.getParent(), Spawned);
+  Todo.push_back(Spawned);
+
+  while (!Todo.empty()) {
+    BasicBlock *BB = Todo.pop_back_val();
+
+    if (!functionPieces.insert(BB).second)
+      continue;
+
+    TerminatorInst *Term = BB->getTerminator();
+    if (Term == nullptr) return false;
+    if (isa<ReattachInst>(Term)) {
+      // Only analyze reattaches going to the same continuation.
+      if (Term->getSuccessor(0) != Continue) continue;
+      if (replace) {
+        BranchInst* toReplace = BranchInst::Create(Continue);
+        ReplaceInstWithInst(Term, toReplace);
+        reattachB.push_back(BB);
+      }
+      continue;
+    } else if (isa<DetachInst>(Term)) {
+      assert(Term != &Detach && "Found recursive detach!");
+      Todo.push_back(Term->getSuccessor(0));
+      Todo.push_back(Term->getSuccessor(1));
+      continue;
+    } else if (isa<SyncInst>(Term)) {
+      // Only sync inner elements; consider as a branch.
+      Todo.push_back(Term->getSuccessor(0));
+      continue;
+    } else if (isa<BranchInst>(Term) || isa<SwitchInst>(Term) ||
+               isa<InvokeInst>(Term)) {
+      for (BasicBlock *Succ : successors(BB)) {
+        if (!DT.dominates(DetachEdge, Succ)) {
+          // We assume that this block is an exception-handling block and save
+          // it for later processing.
+          ExitBlocks.insert(Succ);
+          WorkListEH.push_back(Succ);
+        } else {
+          Todo.push_back(Succ);
+        }
+      }
+      // We don't bother cloning unreachable exits from the detached CFG at this
+      // point.  We're cloning the entire detached CFG anyway when we outline
+      // the function.
+      continue;
+    } else if (isa<UnreachableInst>(Term) || isa<ResumeInst>(Term)) {
+      continue;
+    } else {
+      DEBUG(Term->dump());
+      DEBUG(Term->getParent()->getParent()->dump());
+      assert(!error && "Detached block did not absolutely terminate in reattach");
+      return false;
+    }
+  }
+
+  // Find the exit-handling blocks.
+  {
+    SmallPtrSet<BasicBlock *, 4> Visited;
+    while (!WorkListEH.empty()) {
+      BasicBlock *BB = WorkListEH.pop_back_val();
+      if (!Visited.insert(BB).second)
+        continue;
+
+      // Make sure that the control flow through these exception-handling blocks
+      // cannot re-enter the blocks being outlined.
+      assert(!functionPieces.count(BB) &&
+             "EH blocks for a detached region reenter that region.");
+
+      // Make sure that the control flow through these exception-handling blocks
+      // doesn't perform an ordinary return.
+      assert(!isa<ReturnInst>(BB->getTerminator()) &&
+             "EH block terminated by return.");
+
+      // Make sure that the control flow through these exception-handling blocks
+      // doesn't reattach to the detached CFG's continuation.
+      if (ReattachInst *RI = dyn_cast<ReattachInst>(BB->getTerminator()))
+        assert(RI->getSuccessor(0) != Continue &&
+               "Exit block reaches a reattach to the continuation.");
+
+      // if (isa<ResumeInst>(BB->getTerminator()))
+      //   ResumeBlocks.push_back(BB);
+
+      for (BasicBlock *Succ : successors(BB)) {
+        ExitBlocks.insert(Succ);
+        WorkListEH.push_back(Succ);
+      }
+    }
+
+    // Visited now contains exception-handling blocks that we want to clone as
+    // part of outlining.
+    for (BasicBlock *EHBlock : Visited)
+      functionPieces.insert(EHBlock);
+  }
+
+  return true;
+}
+
+// Returns the extracted helper function on success, nullptr otherwise.
+Function *llvm::cilk::extractDetachBodyToFunction(DetachInst &detach,
+                                                  DominatorTree &DT,
+                                                  AssumptionCache &AC,
+                                                  CallInst **call) {
+  BasicBlock *Detacher = detach.getParent();
+  Function &F = *(Detacher->getParent());
+
+  BasicBlock *Spawned = detach.getDetached();
+  BasicBlock *Continue = detach.getContinue();
+
+  SmallPtrSet<BasicBlock *, 32> functionPieces;
+  SmallVector<BasicBlock *, 32> reattachB;
+  SmallPtrSet<BasicBlock *, 4> ExitBlocks;
+
+  // if (!Spawned->getUniquePredecessor())
+  //   dbgs() << *Spawned;
+  assert(Spawned->getUniquePredecessor() &&
+         "Entry block of detached CFG has multiple predecessors.");
+  assert(Spawned->getUniquePredecessor() == Detacher &&
+         "Broken CFG.");
+
+  // if (getNumPred(Spawned) > 1) {
+  //   dbgs() << "Found multiple predecessors to a detached-CFG entry block "
+  //          << Spawned->getName() << ".\n";
+  //   BasicBlock* ts = BasicBlock::Create(Spawned->getContext(),
+  //                                       Spawned->getName()+".fx", &F, Detacher);
+  //   IRBuilder<> b(ts);
+  //   b.CreateBr(Spawned);
+  //   detach.setSuccessor(0,ts);
+  //   llvm::BasicBlock::iterator i = Spawned->begin();
+  //   while (auto phi = llvm::dyn_cast<PHINode>(i)) {
+  //     int idx = phi->getBasicBlockIndex(detach.getParent());
+  //     phi->setIncomingBlock(idx, ts);
+  //     ++i;
+  //   }
+  //   Spawned = ts;
+  // }
+
+  if (!populateDetachedCFG(detach, DT, functionPieces, reattachB,
+                           ExitBlocks, true))
+    return nullptr;
+
+  // functionPieces.erase(Spawned);
+  // std::vector<BasicBlock *> blocks(functionPieces.begin(), functionPieces.end());
+  // blocks.insert(blocks.begin(), Spawned);
+  // functionPieces.insert(Spawned);
+
+  // Check the spawned block's predecessors.
+  for (BasicBlock *BB : functionPieces) {
+    int detached_count = 0;
+    if (ExitBlocks.count(BB))
+      continue;
+    for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+      BasicBlock *Pred = *PI;
+      if (detached_count == 0 && BB == Spawned && Pred == detach.getParent()) {
+        detached_count = 1;
+        continue;
+      }
+      assert(functionPieces.count(Pred) &&
+             "Block inside of detached context branched into from outside branch context");
+    }
+  }
+
+  // Get the inputs and outputs for the detached CFG.
+  SetVector<Value *> Inputs, Outputs;
+  findInputsOutputs(functionPieces, Inputs, Outputs, &ExitBlocks);
+  // extractor.findInputsOutputs(Inputs, Outputs);
+  assert(Outputs.empty() &&
+         "All results from detached CFG should be passed by memory already.");
+
+  // Clone the detached CFG into a helper function.
+  ValueToValueMapTy VMap;
+  Function *extracted;
+  {
+    SmallVector<ReturnInst *, 4> Returns;  // Ignore returns cloned.
+    std::vector<BasicBlock *> blocks(functionPieces.begin(), functionPieces.end());
+
+    extracted = CreateHelper(Inputs, Outputs, blocks,
+                             Spawned, Detacher, Continue,
+                             VMap, F.getParent(),
+                             F.getSubprogram() != nullptr, Returns, ".cilk",
+                             &ExitBlocks, nullptr, nullptr, nullptr, nullptr);
+
+    assert(Returns.empty() && "Returns cloned when cloning detached CFG.");
+
+    // Use a fast calling convention for the helper.
+    extracted->setCallingConv(CallingConv::Fast);
+    // extracted->setCallingConv(F.getCallingConv());
+
+    extracted->addFnAttr(Attribute::NoInline);
+  }
+
+  // Add alignment assumptions to arguments of helper, based on alignment of
+  // values in old function.
+  AddAlignmentAssumptions(&F, Inputs, VMap, &detach, &AC, &DT);
+
+  // Add call to new helper function in original function.
+  CallInst *TopCall;
+  {
+    // Create call instruction.
+    IRBuilder<> Builder(&detach);
+    TopCall = Builder.CreateCall(extracted, Inputs.getArrayRef());
+    // Use a fast calling convention for the helper.
+    TopCall->setCallingConv(CallingConv::Fast);
+    // TopCall->setCallingConv(extracted->getCallingConv());
+    TopCall->setDebugLoc(detach.getDebugLoc());
+  }
+  if (call)
+    *call = TopCall;
+
+  // Move allocas in the newly cloned detached CFG to the entry block of the
+  // helper.
+  {
+    // Collect reattach instructions.
+    SmallVector<Instruction *, 4> ReattachPoints;
+    for (pred_iterator PI = pred_begin(Continue), PE = pred_end(Continue);
+         PI != PE; ++PI) {
+      BasicBlock *Pred = *PI;
+      if (!isa<ReattachInst>(Pred->getTerminator())) continue;
+      if (functionPieces.count(Pred))
+        ReattachPoints.push_back(cast<BasicBlock>(VMap[Pred])->getTerminator());
+    }
+
+    // Move allocas in cloned detached block to entry of helper function.
+    BasicBlock *ClonedDetachedBlock = cast<BasicBlock>(VMap[Spawned]);
+    MoveStaticAllocasInBlock(&extracted->getEntryBlock(), ClonedDetachedBlock,
+                             ReattachPoints);
+
+    // We should not need to add new llvm.stacksave/llvm.stackrestore
+    // intrinsics, because calling and returning from the helper will
+    // automatically manage the stack.
+  }
+
+  return extracted;
+}
+
+Function *llvm::cilk::createDetach(DetachInst &detach,
+                                   ValueToValueMapTy &DetachCtxToStackFrame,
+                                   DominatorTree &DT, AssumptionCache &AC,
+                                   bool instrument) {
+  BasicBlock *detB = detach.getParent();
+  Function &F = *(detB->getParent());
+
+  BasicBlock *Spawned = detach.getDetached();
+  BasicBlock *Continue = detach.getContinue();
+
+  Module *M = F.getParent();
+  // Replace with a branch to the successor.
+  // entry / cilk.spawn.savestate
+  Value *SF = GetOrInitCilkStackFrame(F, DetachCtxToStackFrame,
+                                      /*isFast=*/false, instrument);
+  // assert(SF && "null stack frame unexpected");
+
+  // dbgs() << *detB << *Spawned << *Continue;
+
+  // if (!Spawned->getUniquePredecessor())
+  //   SplitEdge(detB, Spawned, &DT, nullptr);
+
+  // dbgs() << *detB << *(detach.getDetached());
+
+  CallInst *cal = nullptr;
+  Function *extracted = extractDetachBodyToFunction(detach, DT, AC, &cal);
+  assert(extracted && "could not extract detach body to function");
+
+  // Unlink the detached CFG in the original function. The heavy lifting of
+  // removing the outlined detached-CFG is left to subsequent DCE.
+  BranchInst *ContinueBr;
+  {
+    // Replace the detach with a branch to the continuation.
+    ContinueBr = BranchInst::Create(Continue);
+    ReplaceInstWithInst(&detach, ContinueBr);
+
+    // Rewrite phis in the detached block.
+    BasicBlock::iterator BI = Spawned->begin();
+    while (PHINode *P = dyn_cast<PHINode>(BI)) {
+      // int j = P->getBasicBlockIndex(detB);
+      // assert(j >= 0 && "Can't find exiting block in exit block's phi node!");
+      P->removeIncomingValue(detB);
+      ++BI;
+    }
+  }
+
+  Value *SetJmpRes;
+  {
+    IRBuilder<> B(cal);
+
+    if (instrument)
+      // cilk_spawn_prepare
+      B.CreateCall(CILK_CSI_FUNC(spawn_prepare, *M), SF);
+
+    // Need to save state before spawning.
+    SetJmpRes = EmitCilkSetJmp(B, SF, *M);
+
+    if (instrument)
+      // cilk_spawn_or_continue
+      B.CreateCall(CILK_CSI_FUNC(spawn_or_continue, *M), SetJmpRes);
+  }
+
+  // Conditionally call the new helper function based on the result of the
+  // setjmp.
+  {
+    BasicBlock *CallBlock = SplitBlock(detB, cal, &DT);
+    BasicBlock *CallCont = SplitBlock(CallBlock,
+                                      CallBlock->getTerminator(), &DT);
+    IRBuilder<> B(detB->getTerminator());
+    SetJmpRes = B.CreateICmpEQ(SetJmpRes,
+                               ConstantInt::get(SetJmpRes->getType(), 0));
+    B.CreateCondBr(SetJmpRes, CallBlock, CallCont);
+    detB->getTerminator()->eraseFromParent();
+  }
+
+  makeFunctionDetachable(*extracted, DetachCtxToStackFrame, instrument);
+
+  return extracted;
+}
diff --git a/llvm/lib/Transforms/Tapir/LLVMBuild.txt b/llvm/lib/Transforms/Tapir/LLVMBuild.txt
new file mode 100644
index 00000000000000..9b7ec2935c92fc
--- /dev/null
+++ b/llvm/lib/Transforms/Tapir/LLVMBuild.txt
@@ -0,0 +1,22 @@
+;===- ./lib/Transforms/Tapir/LLVMBuild.txt ---------------------*- Conf -*--===;
+;
+;                     The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = TapirOpts
+parent = Transforms
+required_libraries = Analysis Core Scalar Support TransformUtils
diff --git a/llvm/lib/Transforms/Tapir/LoopSpawning.cpp b/llvm/lib/Transforms/Tapir/LoopSpawning.cpp
new file mode 100644
index 00000000000000..a62e445eecf277
--- /dev/null
+++ b/llvm/lib/Transforms/Tapir/LoopSpawning.cpp
@@ -0,0 +1,2413 @@
+//===- LoopSpawning.cpp - Spawn loop iterations efficiently ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Modify Tapir loops to spawn their iterations efficiently.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Tapir/LoopSpawning.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/IndVarSimplify.h"
+#include "llvm/Transforms/Scalar/SimplifyCFG.h"
+#include "llvm/Transforms/Scalar/LoopDeletion.h"
+#include "llvm/Transforms/Tapir.h"
+#include "llvm/Transforms/Tapir/CilkABI.h"
+#include "llvm/Transforms/Tapir/Outline.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/TapirUtils.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <utility>
+
+using std::make_pair;
+
+using namespace llvm;
+
+#define LS_NAME "loop-spawning"
+#define DEBUG_TYPE LS_NAME
+
+STATISTIC(LoopsAnalyzed, "Number of Tapir loops analyzed");
+STATISTIC(LoopsConvertedToDAC,
+          "Number of Tapir loops converted to divide-and-conquer iteration spawning");
+STATISTIC(LoopsConvertedToCilkABI,
+          "Number of Tapir loops converted to use the Cilk ABI for loops");
+
+namespace {
+// Forward declarations.
+class LoopSpawningHints;
+
+// /// \brief This modifies LoopAccessReport to initialize message with
+// /// tapir-loop-specific part.
+// class LoopSpawningReport : public LoopAccessReport {
+// public:
+//   LoopSpawningReport(Instruction *I = nullptr)
+//       : LoopAccessReport("loop-spawning: ", I) {}
+
+//   /// \brief This allows promotion of the loop-access analysis report into the
+//   /// loop-spawning report. It modifies the message to add the
+//   /// loop-spawning-specific part of the message.
+//   explicit LoopSpawningReport(const LoopAccessReport &R)
+//       : LoopAccessReport(Twine("loop-spawning: ") + R.str(),
+//                          R.getInstr()) {}
+// };
+
+
+/// Utility class for getting and setting loop spawning hints in the form
+/// of loop metadata.
+/// This class keeps a number of loop annotations locally (as member variables) +/// and can, upon request, write them back as metadata on the loop. It will +/// initially scan the loop for existing metadata, and will update the local +/// values based on information in the loop. +class LoopSpawningHints { + enum HintKind { HK_STRATEGY }; + + /// Hint - associates name and validation with the hint value. + struct Hint { + const char *Name; + unsigned Value; // This may have to change for non-numeric values. + HintKind Kind; + + Hint(const char *Name, unsigned Value, HintKind Kind) + : Name(Name), Value(Value), Kind(Kind) {} + + bool validate(unsigned Val) { + switch (Kind) { + case HK_STRATEGY: + return (Val < ST_END); + } + return false; + } + }; + + /// Spawning strategy + Hint Strategy; + + /// Return the loop metadata prefix. + static StringRef Prefix() { return "tapir.loop."; } + +public: + enum SpawningStrategy { + ST_SEQ, + ST_DAC, + ST_END, + }; + + static std::string printStrategy(enum SpawningStrategy Strat) { + switch(Strat) { + case LoopSpawningHints::ST_SEQ: + return "Spawn iterations sequentially"; + case LoopSpawningHints::ST_DAC: + return "Use divide-and-conquer"; + case LoopSpawningHints::ST_END: + default: + return "Unknown"; + } + } + + LoopSpawningHints(const Loop *L, OptimizationRemarkEmitter &ORE) + : Strategy("spawn.strategy", ST_SEQ, HK_STRATEGY), + TheLoop(L), ORE(ORE) { + // Populate values with existing loop metadata. + getHintsFromMetadata(); + } + + // /// Dumps all the hint information. + // std::string emitRemark() const { + // LoopSpawningReport R; + // R << "Strategy = " << printStrategy(getStrategy()); + + // return R.str(); + // } + + enum SpawningStrategy getStrategy() const { + return (SpawningStrategy)Strategy.Value; + } + +private: + /// Find hints specified in the loop metadata and update local values. 
+  void getHintsFromMetadata() {
+    MDNode *LoopID = TheLoop->getLoopID();
+    if (!LoopID)
+      return;
+
+    // First operand should refer to the loop id itself.
+    assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
+    assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
+
+    for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+      const MDString *S = nullptr;
+      SmallVector<Metadata *, 4> Args;
+
+      // The expected hint is either a MDString or a MDNode with the first
+      // operand a MDString.
+      if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
+        if (!MD || MD->getNumOperands() == 0)
+          continue;
+        S = dyn_cast<MDString>(MD->getOperand(0));
+        for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
+          Args.push_back(MD->getOperand(i));
+      } else {
+        S = dyn_cast<MDString>(LoopID->getOperand(i));
+        assert(Args.size() == 0 && "too many arguments for MDString");
+      }
+
+      if (!S)
+        continue;
+
+      // Check if the hint starts with the loop metadata prefix.
+      StringRef Name = S->getString();
+      if (Args.size() == 1)
+        setHint(Name, Args[0]);
+    }
+  }
+
+  /// Checks string hint with one operand and set value if valid.
+  void setHint(StringRef Name, Metadata *Arg) {
+    if (!Name.startswith(Prefix()))
+      return;
+    Name = Name.substr(Prefix().size(), StringRef::npos);
+
+    const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
+    if (!C)
+      return;
+    unsigned Val = C->getZExtValue();
+
+    Hint *Hints[] = {&Strategy};
+    for (auto H : Hints) {
+      if (Name == H->Name) {
+        if (H->validate(Val))
+          H->Value = Val;
+        else
+          DEBUG(dbgs() << LS_NAME << " ignoring invalid hint '" <<
+                Name << "'\n");
+        break;
+      }
+    }
+  }
+
+  /// Create a new hint from name / value pair.
+  MDNode *createHintMetadata(StringRef Name, unsigned V) const {
+    LLVMContext &Context = TheLoop->getHeader()->getContext();
+    Metadata *MDs[] = {MDString::get(Context, Name),
+                       ConstantAsMetadata::get(
+                           ConstantInt::get(Type::getInt32Ty(Context), V))};
+    return MDNode::get(Context, MDs);
+  }
+
+  /// Matches metadata with hint name.
+  bool matchesHintMetadataName(MDNode *Node, ArrayRef<Hint> HintTypes) {
+    MDString *Name = dyn_cast<MDString>(Node->getOperand(0));
+    if (!Name)
+      return false;
+
+    for (auto H : HintTypes)
+      if (Name->getString().endswith(H.Name))
+        return true;
+    return false;
+  }
+
+  /// Sets current hints into loop metadata, keeping other values intact.
+  void writeHintsToMetadata(ArrayRef<Hint> HintTypes) {
+    if (HintTypes.size() == 0)
+      return;
+
+    // Reserve the first element to LoopID (see below).
+    SmallVector<Metadata *, 4> MDs(1);
+    // If the loop already has metadata, then ignore the existing operands.
+    MDNode *LoopID = TheLoop->getLoopID();
+    if (LoopID) {
+      for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+        MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
+        // If node in update list, ignore old value.
+        if (!matchesHintMetadataName(Node, HintTypes))
+          MDs.push_back(Node);
+      }
+    }
+
+    // Now, add the missing hints.
+    for (auto H : HintTypes)
+      MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value));
+
+    // Replace current metadata node with new one.
+    LLVMContext &Context = TheLoop->getHeader()->getContext();
+    MDNode *NewLoopID = MDNode::get(Context, MDs);
+    // Set operand 0 to refer to the loop id itself.
+    NewLoopID->replaceOperandWith(0, NewLoopID);
+
+    TheLoop->setLoopID(NewLoopID);
+  }
+
+  /// The loop these hints belong to.
+  const Loop *TheLoop;
+
+  /// Interface to emit optimization remarks.
+ OptimizationRemarkEmitter &ORE; +}; + +// static void emitAnalysisDiag(const Loop *TheLoop, +// OptimizationRemarkEmitter &ORE, +// const LoopAccessReport &Message) { +// const char *Name = LS_NAME; +// LoopAccessReport::emitAnalysis(Message, TheLoop, Name, ORE); +// } + +static void emitMissedWarning(Function *F, Loop *L, + const LoopSpawningHints &LH, + OptimizationRemarkEmitter *ORE) { + // ORE->emit(OptimizationRemarkMissed( + // LS_NAME, "LSHint", L->getStartLoc(), L->getHeader()) + // << "Strategy = " + // << LoopSpawningHints::printStrategy(LH.getStrategy())); + switch (LH.getStrategy()) { + case LoopSpawningHints::ST_DAC: + ORE->emit(DiagnosticInfoOptimizationFailure( + DEBUG_TYPE, "FailedRequestedSpawning", + L->getStartLoc(), L->getHeader()) + << "Tapir loop not transformed: " + << "failed to use divide-and-conquer loop spawning"); + break; + case LoopSpawningHints::ST_SEQ: + ORE->emit(DiagnosticInfoOptimizationFailure( + DEBUG_TYPE, "SpawningDisabled", + L->getStartLoc(), L->getHeader()) + << "Tapir loop not transformed: " + << "loop-spawning transformation disabled"); + break; + case LoopSpawningHints::ST_END: + ORE->emit(DiagnosticInfoOptimizationFailure( + DEBUG_TYPE, "FailedRequestedSpawning", + L->getStartLoc(), L->getHeader()) + << "Tapir loop not transformed: " + << "unknown loop-spawning strategy"); + break; + } +} + +/// LoopOutline serves as a base class for different variants of LoopSpawning. +/// LoopOutline implements common parts of LoopSpawning transformations, namely, +/// lifting a Tapir loop into a separate helper function. +class LoopOutline { +public: + + LoopOutline(Loop *OrigLoop, ScalarEvolution &SE, + LoopInfo *LI, DominatorTree *DT, + AssumptionCache *AC, + OptimizationRemarkEmitter &ORE) + : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), AC(AC), ORE(ORE), + ExitBlock(nullptr) + { + // Use the loop latch to determine the canonical exit block for this loop. 
+ TerminatorInst *TI = OrigLoop->getLoopLatch()->getTerminator(); + if (2 != TI->getNumSuccessors()) + return; + ExitBlock = TI->getSuccessor(0); + if (ExitBlock == OrigLoop->getHeader()) + ExitBlock = TI->getSuccessor(1); + } + + virtual bool processLoop() = 0; + + virtual ~LoopOutline() {} + +protected: + PHINode* canonicalizeIVs(Type *Ty); + Value* canonicalizeLoopLatch(PHINode *IV, Value *Limit); + void unlinkLoop(); + + /// The original loop. + Loop *OrigLoop; + + /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies + /// dynamic knowledge to simplify SCEV expressions and converts them to a + /// more usable form. + // PredicatedScalarEvolution &PSE; + ScalarEvolution &SE; + /// Loop info. + LoopInfo *LI; + /// Dominator tree. + DominatorTree *DT; + /// Assumption cache. + AssumptionCache *AC; + /// Interface to emit optimization remarks. + OptimizationRemarkEmitter &ORE; + + /// The exit block of this loop. We compute our own exit block, based on the + /// latch, and handle other exit blocks (i.e., for exception handling) in a + /// special manner. + BasicBlock *ExitBlock; + +// private: +// /// Report an analysis message to assist the user in diagnosing loops that are +// /// not transformed. These are handled as LoopAccessReport rather than +// /// VectorizationReport because the << operator of LoopSpawningReport returns +// /// LoopAccessReport. +// void emitAnalysis(const LoopAccessReport &Message) const { +// emitAnalysisDiag(OrigLoop, *ORE, Message); +// } +}; + +/// DACLoopSpawning implements the transformation to spawn the iterations of a +/// Tapir loop in a recursive divide-and-conquer fashion. 
+class DACLoopSpawning : public LoopOutline { +public: + // DACLoopSpawning(Loop *OrigLoop, ScalarEvolution &SE, + // LoopInfo *LI, DominatorTree *DT, + // const TargetLibraryInfo *TLI, + // const TargetTransformInfo *TTI, + // OptimizationRemarkEmitter *ORE) + // : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), + // TLI(TLI), TTI(TTI), ORE(ORE) + // {} + + DACLoopSpawning(Loop *OrigLoop, ScalarEvolution &SE, + LoopInfo *LI, DominatorTree *DT, + AssumptionCache *AC, + OptimizationRemarkEmitter &ORE) + : LoopOutline(OrigLoop, SE, LI, DT, AC, ORE) + {} + + bool processLoop(); + + virtual ~DACLoopSpawning() {} + +protected: + Value* computeGrainsize(Value *Limit); + void implementDACIterSpawnOnHelper(Function *Helper, + BasicBlock *Preheader, + BasicBlock *Header, + PHINode *CanonicalIV, + Argument *Limit, + Argument *Grainsize, + Instruction *SyncRegion, + DominatorTree *DT, + LoopInfo *LI, + bool CanonicalIVFlagNUW = false, + bool CanonicalIVFlagNSW = false); + +// private: +// /// Report an analysis message to assist the user in diagnosing loops that are +// /// not transformed. These are handled as LoopAccessReport rather than +// /// VectorizationReport because the << operator of LoopSpawningReport returns +// /// LoopAccessReport. +// void emitAnalysis(const LoopAccessReport &Message) const { +// emitAnalysisDiag(OrigLoop, *ORE, Message); +// } +}; + +/// CilkABILoopSpawning uses the Cilk Plus ABI to handle Tapir loops. 
+class CilkABILoopSpawning : public LoopOutline {
+public:
+  CilkABILoopSpawning(Loop *OrigLoop, ScalarEvolution &SE,
+                      LoopInfo *LI, DominatorTree *DT,
+                      AssumptionCache *AC,
+                      OptimizationRemarkEmitter &ORE)
+      : LoopOutline(OrigLoop, SE, LI, DT, AC, ORE)
+  {}
+
+  bool processLoop();
+
+  virtual ~CilkABILoopSpawning() {}
+
+protected:
+  // PHINode* canonicalizeIVs(Type *Ty);
+  Value* canonicalizeLoopLatch(PHINode *IV, Value *Limit);
+
+// private:
+//   /// Report an analysis message to assist the user in diagnosing loops that are
+//   /// not transformed.  These are handled as LoopAccessReport rather than
+//   /// VectorizationReport because the << operator of LoopSpawningReport returns
+//   /// LoopAccessReport.
+//   void emitAnalysis(const LoopAccessReport &Message) const {
+//     emitAnalysisDiag(OrigLoop, *ORE, Message);
+//   }
+};
+
+struct LoopSpawningImpl {
+  // LoopSpawningImpl(Function &F, LoopInfo &LI, ScalarEvolution &SE,
+  //                  DominatorTree &DT,
+  //                  const TargetTransformInfo &TTI,
+  //                  const TargetLibraryInfo *TLI,
+  //                  AliasAnalysis &AA, AssumptionCache &AC,
+  //                  OptimizationRemarkEmitter &ORE)
+  //     : F(&F), LI(&LI), SE(&SE), DT(&DT), TTI(&TTI), TLI(TLI),
+  //       AA(&AA), AC(&AC), ORE(&ORE) {}
+  // LoopSpawningImpl(Function &F,
+  //                  function_ref GetLI,
+  //                  function_ref GetSE,
+  //                  function_ref GetDT,
+  //                  OptimizationRemarkEmitter &ORE)
+  //     : F(F), GetLI(GetLI), LI(nullptr), GetSE(GetSE), GetDT(GetDT),
+  //       ORE(ORE)
+  //   {}
+  LoopSpawningImpl(Function &F,
+                   LoopInfo &LI,
+                   ScalarEvolution &SE,
+                   DominatorTree &DT,
+                   AssumptionCache &AC,
+                   OptimizationRemarkEmitter &ORE)
+      : F(F), LI(LI), SE(SE), DT(DT), AC(AC), ORE(ORE) {}
+
+  bool run();
+
+private:
+  void addTapirLoop(Loop *L, SmallVectorImpl<Loop *> &V);
+  bool isTapirLoop(const Loop *L);
+  bool processLoop(Loop *L);
+
+  Function &F;
+  // function_ref GetLI;
+  LoopInfo &LI;
+  // function_ref GetSE;
+  // function_ref GetDT;
+  ScalarEvolution &SE;
+  DominatorTree &DT;
+  // const TargetTransformInfo *TTI;
+  // const TargetLibraryInfo *TLI;
+  // AliasAnalysis *AA;
+  AssumptionCache &AC;
+  OptimizationRemarkEmitter &ORE;
+};
+}  // end anonymous namespace
+
+/// Canonicalize the induction variables in the loop.  Return the canonical
+/// induction variable created or inserted by the scalar evolution expander.
+PHINode* LoopOutline::canonicalizeIVs(Type *Ty) {
+  Loop *L = OrigLoop;
+
+  BasicBlock* Header = L->getHeader();
+  Module* M = Header->getParent()->getParent();
+
+  SCEVExpander Exp(SE, M->getDataLayout(), "ls");
+
+  PHINode *CanonicalIV = Exp.getOrInsertCanonicalInductionVariable(L, Ty);
+  DEBUG(dbgs() << "LS Canonical induction variable " << *CanonicalIV << "\n");
+
+  SmallVector<WeakTrackingVH, 16> DeadInsts;
+  Exp.replaceCongruentIVs(L, DT, DeadInsts);
+  for (WeakTrackingVH V : DeadInsts) {
+    DEBUG(dbgs() << "LS erasing dead inst " << *V << "\n");
+    Instruction *I = cast<Instruction>(V);
+    I->eraseFromParent();
+  }
+
+  return CanonicalIV;
+}
+
+/// \brief Replace the latch of the loop to check that IV is always less than or
+/// equal to the limit.
+///
+/// This method assumes that the loop has a single loop latch.
+Value* LoopOutline::canonicalizeLoopLatch(PHINode *IV, Value *Limit) {
+  Loop *L = OrigLoop;
+
+  Value *NewCondition;
+  BasicBlock *Header = L->getHeader();
+  BasicBlock *Latch = L->getLoopLatch();
+  assert(Latch && "No single loop latch found for loop.");
+
+  IRBuilder<> Builder(&*Latch->getFirstInsertionPt());
+
+  // This process assumes that IV's increment is in Latch.
+
+  // Create comparison between IV and Limit at top of Latch.
+  NewCondition = Builder.CreateICmpULT(IV, Limit);
+
+  // Replace the conditional branch at the end of Latch.
+  BranchInst *LatchBr = dyn_cast_or_null<BranchInst>(Latch->getTerminator());
+  assert(LatchBr && LatchBr->isConditional() &&
+         "Latch does not terminate with a conditional branch.");
+  Builder.SetInsertPoint(Latch->getTerminator());
+  Builder.CreateCondBr(NewCondition, Header, ExitBlock);
+
+  // Erase the old conditional branch.
+  Value *OldCond = LatchBr->getCondition();
+  LatchBr->eraseFromParent();
+  if (!OldCond->hasNUsesOrMore(1))
+    if (Instruction *OldCondInst = dyn_cast<Instruction>(OldCond))
+      OldCondInst->eraseFromParent();
+
+  return NewCondition;
+}
+
+/// Unlink the specified loop, and update analysis accordingly.  The heavy
+/// lifting of deleting the loop is carried out by a run of LoopDeletion after
+/// this pass.
+void LoopOutline::unlinkLoop() {
+  Loop *L = OrigLoop;
+
+  // Get components of the old loop.
+  BasicBlock *Preheader = L->getLoopPreheader();
+  assert(Preheader && "Loop does not have a unique preheader.");
+  BasicBlock *Latch = L->getLoopLatch();
+
+  // Invalidate the analysis of the old loop.
+  SE.forgetLoop(L);
+
+  // Redirect the preheader to branch directly to loop exit.
+  assert(1 == Preheader->getTerminator()->getNumSuccessors() &&
+         "Preheader does not have a unique successor.");
+  Preheader->getTerminator()->replaceUsesOfWith(L->getHeader(),
+                                                ExitBlock);
+
+  // Rewrite phis in the exit block to get their inputs from
+  // the preheader instead of the exiting block.
+  BasicBlock::iterator BI = ExitBlock->begin();
+  while (PHINode *P = dyn_cast<PHINode>(BI)) {
+    int j = P->getBasicBlockIndex(Latch);
+    assert(j >= 0 && "Can't find exiting block in exit block's phi node!");
+    P->setIncomingBlock(j, Preheader);
+    P->removeIncomingValue(Latch);
+    ++BI;
+  }
+
+  // Rewrite phis in the header block to not receive an input from
+  // the preheader.
+  BI = L->getHeader()->begin();
+  while (PHINode *P = dyn_cast<PHINode>(BI)) {
+    P->removeIncomingValue(Preheader);
+    ++BI;
+  }
+}
+
+/// \brief Compute the grainsize of the loop, based on the limit.
+///
+/// The grainsize is computed by the following equation:
+///
+///     Grainsize = min(2048, ceil(Limit / (8 * workers)))
+///
+/// This computation is inserted into the preheader of the loop.
+///
+/// TODO: This method is the only method that depends on the CilkABI.
+/// Generalize this method for other grainsize calculations and to query TLI.
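As an aside from the patch itself, the grainsize formula above can be sanity-checked with a standalone C++ sketch. The `workers` parameter here is a stand-in for the Cilk runtime's worker count (which the generated IR obtains via `GetOrCreateWorker8`); the ceiling division mirrors the `CreateAdd`/`CreateSub`/`CreateUDiv` sequence that `computeGrainsize` emits.

```cpp
#include <algorithm>
#include <cstdint>

// Sketch of the grainsize computation the pass emits as IR:
//   Grainsize = min(2048, ceil(Limit / (8 * workers)))
// `workers` stands in for the runtime worker count (an assumption here).
uint64_t grainsize(uint64_t limit, uint64_t workers) {
  uint64_t workers8 = 8 * workers;
  // ceil(limit / workers8) computed as (limit + workers8 - 1) / workers8.
  uint64_t smallLoopVal = (limit + workers8 - 1) / workers8;
  return std::min<uint64_t>(2048, smallLoopVal);
}
```

For example, with 4 workers a million-iteration loop is capped at the 2048 maximum, while a 64-iteration loop gets a grainsize of 2.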
+Value* DACLoopSpawning::computeGrainsize(Value *Limit) { + Loop *L = OrigLoop; + + Value *Grainsize; + BasicBlock *Preheader = L->getLoopPreheader(); + assert(Preheader && "No Preheader found for loop."); + + IRBuilder<> Builder(Preheader->getTerminator()); + + // Get 8 * workers + Value *Workers8 = Builder.CreateIntCast(cilk::GetOrCreateWorker8(*Preheader->getParent()), + Limit->getType(), false); + // Compute ceil(limit / 8 * workers) = (limit + 8 * workers - 1) / (8 * workers) + Value *SmallLoopVal = + Builder.CreateUDiv(Builder.CreateSub(Builder.CreateAdd(Limit, Workers8), + ConstantInt::get(Limit->getType(), 1)), + Workers8); + // Compute min + Value *LargeLoopVal = ConstantInt::get(Limit->getType(), 2048); + Value *Cmp = Builder.CreateICmpULT(LargeLoopVal, SmallLoopVal); + Grainsize = Builder.CreateSelect(Cmp, LargeLoopVal, SmallLoopVal); + + return Grainsize; +} + +/// \brief Method to help convertLoopToDACIterSpawn convert the Tapir +/// loop cloned into function Helper to spawn its iterations in a +/// parallel divide-and-conquer fashion. +/// +/// Example: Suppose that Helper contains the following Tapir loop: +/// +/// Helper(iter_t start, iter_t end, iter_t grain, ...) { +/// iter_t i = start; +/// ... Other loop setup ... +/// do { +/// spawn { ... loop body ... }; +/// } while (i++ < end); +/// sync; +/// } +/// +/// Then this method transforms Helper into the following form: +/// +/// Helper(iter_t start, iter_t end, iter_t grain, ...) { +/// recur: +/// iter_t itercount = end - start; +/// if (itercount > grain) { +/// // Invariant: itercount >= 2 +/// count_t miditer = start + itercount / 2; +/// spawn Helper(start, miditer, grain, ...); +/// start = miditer + 1; +/// goto recur; +/// } +/// +/// iter_t i = start; +/// ... Other loop setup ... +/// do { +/// ... Loop Body ... 
+///   } while (i++ < end);
+///   sync;
+/// }
+///
+void DACLoopSpawning::implementDACIterSpawnOnHelper(Function *Helper,
+                                                    BasicBlock *Preheader,
+                                                    BasicBlock *Header,
+                                                    PHINode *CanonicalIV,
+                                                    Argument *Limit,
+                                                    Argument *Grainsize,
+                                                    Instruction *SyncRegion,
+                                                    DominatorTree *DT,
+                                                    LoopInfo *LI,
+                                                    bool CanonicalIVFlagNUW,
+                                                    bool CanonicalIVFlagNSW) {
+  // Serialize the cloned copy of the loop.
+  assert(Preheader->getParent() == Helper &&
+         "Preheader does not belong to helper function.");
+  assert(Header->getParent() == Helper &&
+         "Header does not belong to helper function.");
+  assert(CanonicalIV->getParent() == Header &&
+         "CanonicalIV does not belong to header");
+  assert(isa<DetachInst>(Header->getTerminator()) &&
+         "Cloned header is not terminated by a detach.");
+  DetachInst *DI = dyn_cast<DetachInst>(Header->getTerminator());
+  SerializeDetachedCFG(DI, DT);
+
+  // Convert the cloned loop into the strip-mined loop body.
+
+  BasicBlock *DACHead = Preheader;
+  if (&(Helper->getEntryBlock()) == Preheader)
+    // Split the entry block.  We'll want to create a backedge into
+    // the split block later.
+    DACHead = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI);
+
+  BasicBlock *RecurHead, *RecurDet, *RecurCont;
+  Value *IterCount;
+  Value *CanonicalIVInput;
+  PHINode *CanonicalIVStart;
+  {
+    Instruction *PreheaderOrigFront = &(DACHead->front());
+    IRBuilder<> Builder(PreheaderOrigFront);
+    // Create branch based on grainsize.
+ DEBUG(dbgs() << "LS CanonicalIV: " << *CanonicalIV << "\n"); + CanonicalIVInput = CanonicalIV->getIncomingValueForBlock(DACHead); + CanonicalIVStart = Builder.CreatePHI(CanonicalIV->getType(), 2, + CanonicalIV->getName()+".dac"); + CanonicalIVInput->replaceAllUsesWith(CanonicalIVStart); + IterCount = Builder.CreateSub(Limit, CanonicalIVStart, + "itercount"); + Value *IterCountCmp = Builder.CreateICmpUGT(IterCount, Grainsize); + TerminatorInst *RecurTerm = + SplitBlockAndInsertIfThen(IterCountCmp, PreheaderOrigFront, + /*Unreachable=*/false, + /*BranchWeights=*/nullptr, + DT); + RecurHead = RecurTerm->getParent(); + // Create skeleton of divide-and-conquer recursion: + // DACHead -> RecurHead -> RecurDet -> RecurCont -> DACHead + RecurDet = SplitBlock(RecurHead, RecurHead->getTerminator(), + DT, LI); + RecurCont = SplitBlock(RecurDet, RecurDet->getTerminator(), + DT, LI); + RecurCont->getTerminator()->replaceUsesOfWith(RecurTerm->getSuccessor(0), + DACHead); + } + + // Compute mid iteration in RecurHead. + Value *MidIter, *MidIterPlusOne; + { + IRBuilder<> Builder(&(RecurHead->front())); + MidIter = Builder.CreateAdd(CanonicalIVStart, + Builder.CreateLShr(IterCount, 1, + "halfcount"), + "miditer", + CanonicalIVFlagNUW, CanonicalIVFlagNSW); + } + + // Create recursive call in RecurDet. + { + // Create input array for recursive call. 
+    IRBuilder<> Builder(&(RecurDet->front()));
+    SetVector<Value *> RecurInputs;
+    Function::arg_iterator AI = Helper->arg_begin();
+    assert(cast<Argument>(CanonicalIVInput) == &*AI &&
+           "First argument does not match original input to canonical IV.");
+    RecurInputs.insert(CanonicalIVStart);
+    ++AI;
+    assert(Limit == &*AI &&
+           "Second argument does not match original input to the loop limit.");
+    RecurInputs.insert(MidIter);
+    ++AI;
+    for (Function::arg_iterator AE = Helper->arg_end();
+         AI != AE;  ++AI)
+      RecurInputs.insert(&*AI);
+    // RecurInputs.insert(CanonicalIVStart);
+    // // for (PHINode *IV : IVs)
+    // //   RecurInputs.insert(DACStart[IV]);
+    // RecurInputs.insert(Limit);
+    // RecurInputs.insert(Grainsize);
+    // for (Value *V : BodyInputs)
+    //   RecurInputs.insert(VMap[V]);
+    DEBUG({
+        dbgs() << "RecurInputs: ";
+        for (Value *Input : RecurInputs)
+          dbgs() << *Input << ", ";
+        dbgs() << "\n";
+      });
+
+    // Create call instruction.
+    CallInst *RecurCall = Builder.CreateCall(Helper, RecurInputs.getArrayRef());
+    RecurCall->setDebugLoc(Header->getTerminator()->getDebugLoc());
+    // Use a fast calling convention for the helper.
+    RecurCall->setCallingConv(CallingConv::Fast);
+    // RecurCall->setCallingConv(Helper->getCallingConv());
+    // // Update CG graph with the recursive call we just added.
+    // CG[Helper]->addCalledFunction(RecurCall, CG[Helper]);
+  }
+
+  // Set up continuation of detached recursive call.  We effectively
+  // inline this tail call automatically.
+  {
+    IRBuilder<> Builder(&(RecurCont->front()));
+    MidIterPlusOne = Builder.CreateAdd(MidIter,
+                                       ConstantInt::get(Limit->getType(), 1),
+                                       "miditerplusone",
+                                       CanonicalIVFlagNUW,
+                                       CanonicalIVFlagNSW);
+  }
+
+  // Finish setup of new phi node for canonical IV.
+  {
+    CanonicalIVStart->addIncoming(CanonicalIVInput, Preheader);
+    CanonicalIVStart->addIncoming(MidIterPlusOne, RecurCont);
+  }
+
+  /// Make the recursive DAC parallel.
+  {
+    IRBuilder<> Builder(RecurHead->getTerminator());
+    // Create the detach.
+    DetachInst *DI = Builder.CreateDetach(RecurDet, RecurCont, SyncRegion);
+    DI->setDebugLoc(Header->getTerminator()->getDebugLoc());
+    RecurHead->getTerminator()->eraseFromParent();
+    // Create the reattach.
+    Builder.SetInsertPoint(RecurDet->getTerminator());
+    ReattachInst *RI = Builder.CreateReattach(RecurCont, SyncRegion);
+    RI->setDebugLoc(Header->getTerminator()->getDebugLoc());
+    RecurDet->getTerminator()->eraseFromParent();
+  }
+}
+
+/// Helper routine to get all exit blocks of a loop that handle exceptions or
+/// are unreachable.
+static void getEHExits(Loop *L, const BasicBlock *DesignatedExitBlock,
+                       SmallVectorImpl<BasicBlock *> &EHExits) {
+  SmallVector<BasicBlock *, 4> ExitBlocks;
+  L->getExitBlocks(ExitBlocks);
+
+  SmallVector<BasicBlock *, 4> WorkList;
+  for (BasicBlock *Exit : ExitBlocks) {
+    if (Exit == DesignatedExitBlock) continue;
+    EHExits.push_back(Exit);
+    WorkList.push_back(Exit);
+  }
+
+  // Traverse the CFG from these frontier blocks to find all blocks involved in
+  // exception-handling exit code.
+  SmallPtrSet<BasicBlock *, 4> Visited;
+  while (!WorkList.empty()) {
+    BasicBlock *BB = WorkList.pop_back_val();
+    if (!Visited.insert(BB).second)
+      continue;
+
+    // Check that the exception handling blocks do not reenter the loop.
+    assert(!L->contains(BB) &&
+           "Exception handling blocks re-enter loop.");
+
+    for (BasicBlock *Succ : successors(BB)) {
+      EHExits.push_back(Succ);
+      WorkList.push_back(Succ);
+    }
+  }
+}
+
+/// Top-level call to convert loop to spawn its iterations in a
+/// divide-and-conquer fashion.
+bool DACLoopSpawning::processLoop() {
+  Loop *L = OrigLoop;
+
+  BasicBlock *Header = L->getHeader();
+  BasicBlock *Preheader = L->getLoopPreheader();
+  BasicBlock *Latch = L->getLoopLatch();
+
+  DEBUG({
+      LoopBlocksDFS DFS(L);
+      DFS.perform(LI);
+      dbgs() << "Blocks in loop (from DFS):\n";
+      for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
+        dbgs() << *BB;
+    });
+
+  using namespace ore;
+
+  // Check that this loop has a valid exit block after the latch.
+  if (!ExitBlock) {
+    DEBUG(dbgs() << "LS loop does not contain valid exit block after latch.\n");
+    ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "InvalidLatchExit",
+                                        L->getStartLoc(),
+                                        Header)
+             << "invalid latch exit");
+    return false;
+  }
+
+  // Get special exits from this loop.
+  SmallVector<BasicBlock *, 4> EHExits;
+  getEHExits(L, ExitBlock, EHExits);
+
+  // Check the exit blocks of the loop.
+  SmallVector<BasicBlock *, 4> ExitBlocks;
+  L->getExitBlocks(ExitBlocks);
+
+  for (const BasicBlock *Exit : ExitBlocks) {
+    if (Exit == ExitBlock) continue;
+    if (Exit->isLandingPad()) {
+      DEBUG({
+          const LandingPadInst *LPI = Exit->getLandingPadInst();
+          dbgs() << "landing pad found: " << *LPI << "\n";
+          for (const User *U : LPI->users())
+            dbgs() << "\tuser " << *U << "\n";
+        });
+    }
+  }
+  SmallPtrSet<BasicBlock *, 4> HandledExits;
+  for (BasicBlock *BB : EHExits)
+    HandledExits.insert(BB);
+  for (BasicBlock *Exit : ExitBlocks) {
+    if (Exit == ExitBlock) continue;
+    if (!HandledExits.count(Exit)) {
+      DEBUG(dbgs() << "LS loop contains a bad exit block " << *Exit);
+      ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "BadExit",
+                                          L->getStartLoc(),
+                                          Header)
+               << "bad exit block found");
+      return false;
+    }
+  }
+
+  Function *F = Header->getParent();
+  Module* M = F->getParent();
+
+  DEBUG(dbgs() << "LS loop header:" << *Header);
+  DEBUG(dbgs() << "LS loop latch:" << *Latch);
+  DEBUG(dbgs() << "LS SE exit count: " << *(SE.getExitCount(L, Latch)) << "\n");
+
+  /// Get loop limit.
+  const SCEV *Limit = SE.getExitCount(L, Latch);
+  DEBUG(dbgs() << "LS Loop limit: " << *Limit << "\n");
+  // PredicatedScalarEvolution PSE(SE, *L);
+  // const SCEV *PLimit = PSE.getExitCount(L, Latch);
+  // DEBUG(dbgs() << "LS predicated loop limit: " << *PLimit << "\n");
+  // emitAnalysis(LoopSpawningReport()
+  //              << "computed loop limit " << *Limit << "\n");
+  if (SE.getCouldNotCompute() == Limit) {
+    DEBUG(dbgs() << "SE could not compute loop limit.\n");
+    ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "UnknownLoopLimit",
+                                        L->getStartLoc(),
+                                        Header)
+             << "could not compute limit");
+    return false;
+  }
+  // ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "LoopLimit", L->getStartLoc(),
+  //                                     Header)
+  //          << "loop limit: " << NV("Limit", Limit));
+  /// Clean up the loop's induction variables.
+  PHINode *CanonicalIV = canonicalizeIVs(Limit->getType());
+  if (!CanonicalIV) {
+    DEBUG(dbgs() << "Could not get canonical IV.\n");
+    // emitAnalysis(LoopSpawningReport()
+    //              << "Could not get a canonical IV.\n");
+    ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "NoCanonicalIV",
+                                        L->getStartLoc(),
+                                        Header)
+             << "could not find or create canonical IV");
+    return false;
+  }
+  const SCEVAddRecExpr *CanonicalSCEV =
+    cast<const SCEVAddRecExpr>(SE.getSCEV(CanonicalIV));
+
+  // Remove all IV's other than CanonicalIV.
+  // First, check that we can do this.
+  bool CanRemoveIVs = true;
+  for (BasicBlock::iterator II = Header->begin(); isa<PHINode>(II); ++II) {
+    PHINode *PN = cast<PHINode>(II);
+    if (CanonicalIV == PN) continue;
+    // dbgs() << "IV " << *PN;
+    const SCEV *S = SE.getSCEV(PN);
+    // dbgs() << " SCEV " << *S << "\n";
+    if (SE.getCouldNotCompute() == S) {
+      // emitAnalysis(LoopSpawningReport(PN)
+      //              << "Could not compute the scalar evolution.\n");
+      ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "NoSCEV", PN)
+               << "could not compute scalar evolution of "
+               << NV("PHINode", PN));
+      CanRemoveIVs = false;
+    }
+  }
+
+  if (!CanRemoveIVs) {
+    DEBUG(dbgs() << "Could not compute scalar evolutions for all IV's.\n");
+    return false;
+  }
+
+  ////////////////////////////////////////////////////////////////////////
+  // We now have everything we need to extract the loop.  It's time to
+  // do some surgery.
+
+  SCEVExpander Exp(SE, M->getDataLayout(), "ls");
+
+  // Remove the IV's (other than CanonicalIV) and replace them with
+  // their stronger forms.
+  //
+  // TODO?: We can probably adapt this loop->DAC process such that we
+  // don't require all IV's to be canonical.
+  {
+    SmallVector<PHINode *, 8> IVsToRemove;
+    for (BasicBlock::iterator II = Header->begin(); isa<PHINode>(II); ++II) {
+      PHINode *PN = cast<PHINode>(II);
+      if (PN == CanonicalIV) continue;
+      const SCEV *S = SE.getSCEV(PN);
+      DEBUG(dbgs() << "Removing the IV " << *PN << " (" << *S << ")\n");
+      ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "RemoveIV", PN)
+               << "removing the IV "
+               << NV("PHINode", PN));
+      Value *NewIV = Exp.expandCodeFor(S, S->getType(), CanonicalIV);
+      PN->replaceAllUsesWith(NewIV);
+      IVsToRemove.push_back(PN);
+    }
+    for (PHINode *PN : IVsToRemove)
+      PN->eraseFromParent();
+  }
+
+  // All remaining IV's should be canonical.  Collect them.
+  //
+  // TODO?: We can probably adapt this loop->DAC process such that we
+  // don't require all IV's to be canonical.
+  SmallVector<PHINode *, 8> IVs;
+  bool AllCanonical = true;
+  for (BasicBlock::iterator II = Header->begin(); isa<PHINode>(II); ++II) {
+    PHINode *PN = cast<PHINode>(II);
+    DEBUG({
+        const SCEVAddRecExpr *PNSCEV =
+          dyn_cast<const SCEVAddRecExpr>(SE.getSCEV(PN));
+        assert(PNSCEV && "PHINode did not have corresponding SCEVAddRecExpr");
+        assert(PNSCEV->getStart()->isZero() &&
+               "PHINode SCEV does not start at 0");
+        dbgs() << "LS step recurrence for SCEV " << *PNSCEV << " is "
+               << *(PNSCEV->getStepRecurrence(SE)) << "\n";
+        assert(PNSCEV->getStepRecurrence(SE)->isOne() &&
+               "PHINode SCEV step is not 1");
+      });
+    if (ConstantInt *C =
+        dyn_cast<ConstantInt>(PN->getIncomingValueForBlock(Preheader))) {
+      if (C->isZero()) {
+        DEBUG({
+            if (PN != CanonicalIV) {
+              const SCEVAddRecExpr *PNSCEV =
+                dyn_cast<const SCEVAddRecExpr>(SE.getSCEV(PN));
+              dbgs() << "Saving the canonical IV " << *PN << " (" << *PNSCEV << ")\n";
+            }
+          });
+        if (PN != CanonicalIV)
+          ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "SaveIV", PN)
+                   << "saving the canonical IV "
+                   << NV("PHINode", PN));
+        IVs.push_back(PN);
+      }
+    } else {
+      AllCanonical = false;
+      DEBUG(dbgs() << "Remaining non-canonical PHI Node found: " << *PN <<
+            "\n");
+      // emitAnalysis(LoopSpawningReport(PN)
+      //              << "Found a remaining non-canonical IV.\n");
+      ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "NonCanonicalIV", PN)
+               << "found a remaining noncanonical IV");
+    }
+  }
+  if (!AllCanonical)
+    return false;
+
+  // Insert the computation for the loop limit into the Preheader.
+  Value *LimitVar = Exp.expandCodeFor(Limit, Limit->getType(),
+                                      Preheader->getTerminator());
+  DEBUG(dbgs() << "LimitVar: " << *LimitVar << "\n");
+
+  // Canonicalize the loop latch.
+  assert(SE.isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_ULT,
+                                        CanonicalSCEV, Limit) &&
+         "Loop backedge is not guarded by canonical comparison with limit.");
+  Value *NewCond = canonicalizeLoopLatch(CanonicalIV, LimitVar);
+
+  // Insert computation of grainsize into the Preheader.
+  // For debugging:
+  // Value *GrainVar = ConstantInt::get(Limit->getType(), 2);
+  Value *GrainVar = computeGrainsize(LimitVar);
+  DEBUG(dbgs() << "GrainVar: " << *GrainVar << "\n");
+  // emitAnalysis(LoopSpawningReport()
+  //              << "grainsize value " << *GrainVar << "\n");
+  // ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "UsingGrainsize",
+  //                                     L->getStartLoc(), Header)
+  //          << "grainsize: " << NV("Grainsize", GrainVar));
+
+  /// Clone the loop into a new function.
+
+  // Get the inputs and outputs for the Loop blocks.
+  SetVector<Value *> Inputs, Outputs;
+  SetVector<Value *> BodyInputs, BodyOutputs;
+  ValueToValueMapTy VMap, InputMap;
+  std::vector<BasicBlock *> LoopBlocks;
+  SmallPtrSet<BasicBlock *, 4> ExitsToSplit;
+
+  // Get the sync region containing this Tapir loop.
+  const Instruction *InputSyncRegion;
+  {
+    const DetachInst *DI = cast<DetachInst>(Header->getTerminator());
+    InputSyncRegion = cast<Instruction>(DI->getSyncRegion());
+  }
+
+  // Add start iteration, end iteration, and grainsize to inputs.
+  {
+    LoopBlocks = L->getBlocks();
+    // // Add exit blocks terminated by unreachable.  There should not be any
+    // // other exit blocks in the loop.
+    // SmallSet<BasicBlock *, 4> UnreachableExits;
+    // for (BasicBlock *Exit : ExitBlocks) {
+    //   if (Exit == ExitBlock) continue;
+    //   assert(isa<UnreachableInst>(Exit->getTerminator()) &&
+    //          "Found problematic exit block.");
+    //   UnreachableExits.insert(Exit);
+    // }
+
+    // Add unreachable and exception-handling exits to the set of loop blocks
+    // to clone.
+    DEBUG({
+        dbgs() << "Handled exits of loop:";
+        for (BasicBlock *HE : HandledExits)
+          dbgs() << *HE;
+        dbgs() << "\n";
+      });
+    for (BasicBlock *HE : HandledExits)
+      LoopBlocks.push_back(HE);
+    {
+      const DetachInst *DI = cast<DetachInst>(Header->getTerminator());
+      BasicBlockEdge DetachEdge(Header, DI->getDetached());
+      for (BasicBlock *HE : HandledExits)
+        if (!DT || !DT->dominates(DetachEdge, HE))
+          ExitsToSplit.insert(HE);
+      DEBUG({
+          dbgs() << "Loop exits to split:";
+          for (BasicBlock *ETS : ExitsToSplit)
+            dbgs() << *ETS;
+          dbgs() << "\n";
+        });
+    }
+
+    // DEBUG({
+    //     dbgs() << "LoopBlocks: ";
+    //     for (BasicBlock *LB : LoopBlocks)
+    //       dbgs() << LB->getName() << "("
+    //              << *(LB->getTerminator()) << "), ";
+    //     dbgs() << "\n";
+    //   });
+
+    // Get the inputs and outputs for the loop body.
+    {
+      // CodeExtractor Ext(LoopBlocks, DT);
+      // Ext.findInputsOutputs(BodyInputs, BodyOutputs);
+      SmallPtrSet<BasicBlock *, 32> Blocks;
+      for (BasicBlock *BB : LoopBlocks)
+        Blocks.insert(BB);
+      findInputsOutputs(Blocks, BodyInputs, BodyOutputs, &ExitsToSplit);
+    }
+
+    // Add argument for start of CanonicalIV.
+    DEBUG({
+        Value *CanonicalIVInput =
+          CanonicalIV->getIncomingValueForBlock(Preheader);
+        // CanonicalIVInput should be the constant 0.
+        assert(isa<Constant>(CanonicalIVInput) &&
+               "Input to canonical IV from preheader is not constant.");
+      });
+    Argument *StartArg = new Argument(CanonicalIV->getType(),
+                                      CanonicalIV->getName()+".start");
+    Inputs.insert(StartArg);
+    InputMap[CanonicalIV] = StartArg;
+
+    // Add argument for end.
+    //
+    // In the general case, the loop limit is the result of some computation
+    // that the pass added to the loop's preheader.  In this case, the
+    // variable storing the loop limit is used exactly once, in the
+    // canonicalized loop latch, and the pass wants to prevent outlining from
+    // passing the loop-limit variable as an arbitrary argument to the
+    // outlined function.  Hence, this pass adds the loop-limit variable as an
+    // argument manually.
+    //
+    // There are two special cases to consider: the loop limit is a constant,
+    // or the loop limit is used elsewhere within the loop.  To handle these
+    // two cases, this pass adds an explicit argument for the end of the
+    // loop, to support the subsequent transformation to using recursive
+    // divide-and-conquer.  After the loop is outlined, this pass will
+    // rewrite the latch in the outlined loop to use this explicit argument.
+    // Furthermore, this pass does not prevent the outliner from recognizing
+    // the loop limit as a potential argument to the function.
+    if (isa<Constant>(LimitVar) || !LimitVar->hasOneUse()) {
+      Argument *EndArg = new Argument(LimitVar->getType(), "end");
+      Inputs.insert(EndArg);
+      InputMap[LimitVar] = EndArg;
+    } else {
+      // If the limit var is not constant and has exactly one use, then the
+      // limit var is the result of some nontrivial computation, and that one
+      // use is the new condition inserted.
+      Inputs.insert(LimitVar);
+      InputMap[LimitVar] = LimitVar;
+    }
+
+    // Add argument for grainsize.
+    if (isa<Constant>(GrainVar)) {
+      Argument *GrainArg = new Argument(GrainVar->getType(), "grainsize");
+      Inputs.insert(GrainArg);
+      InputMap[GrainVar] = GrainArg;
+    } else {
+      Inputs.insert(GrainVar);
+      InputMap[GrainVar] = GrainVar;
+    }
+
+    // Put all of the inputs together, and clear redundant inputs from
+    // the set for the loop body.
+    SmallVector<Value *, 8> BodyInputsToRemove;
+    for (Value *V : BodyInputs)
+      if (V == InputSyncRegion)
+        BodyInputsToRemove.push_back(V);
+      else if (!Inputs.count(V))
+        Inputs.insert(V);
+      else
+        BodyInputsToRemove.push_back(V);
+    for (Value *V : BodyInputsToRemove)
+      BodyInputs.remove(V);
+    DEBUG({
+        for (Value *V : BodyInputs)
+          dbgs() << "Remaining body input: " << *V << "\n";
+        for (Value *V : BodyOutputs)
+          dbgs() << "EL output: " << *V << "\n";
+      });
+    assert(0 == BodyOutputs.size() &&
+           "All results from parallel loop should be passed by memory already.");
+  }
+  DEBUG({
+      for (Value *V : Inputs)
+        dbgs() << "EL input: " << *V << "\n";
+      for (Value *V : Outputs)
+        dbgs() << "EL output: " << *V << "\n";
+    });
+
+  // Clone the loop blocks into a new helper function.
+  Function *Helper;
+  {
+    SmallVector<ReturnInst *, 4> Returns;  // Ignore returns cloned.
+
+    // LowerDbgDeclare(*(Header->getParent()));
+
+    Helper = CreateHelper(Inputs, Outputs, LoopBlocks,
+                          Header, Preheader, ExitBlock,
+                          VMap, M,
+                          F->getSubprogram() != nullptr, Returns, ".ls",
+                          &ExitsToSplit, InputSyncRegion,
+                          nullptr, nullptr, nullptr);
+
+    assert(Returns.empty() && "Returns cloned when cloning loop.");
+
+    // Use a fast calling convention for the helper.
+    Helper->setCallingConv(CallingConv::Fast);
+    // Helper->setCallingConv(Header->getParent()->getCallingConv());
+  }
+
+  // Add a sync to the helper's return.
+  BasicBlock *HelperHeader = cast<BasicBlock>(VMap[Header]);
+  {
+    BasicBlock *HelperExit = cast<BasicBlock>(VMap[ExitBlock]);
+    assert(isa<ReturnInst>(HelperExit->getTerminator()));
+    BasicBlock *NewHelperExit = SplitBlock(HelperExit,
+                                           HelperExit->getTerminator(),
+                                           DT, LI);
+    IRBuilder<> Builder(&(HelperExit->front()));
+    SyncInst *NewSync = Builder.CreateSync(
+        NewHelperExit,
+        cast<Instruction>(VMap[InputSyncRegion]));
+    // Set debug info of new sync to match that of terminator of the header
+    // of the cloned loop.
+    NewSync->setDebugLoc(HelperHeader->getTerminator()->getDebugLoc());
+    HelperExit->getTerminator()->eraseFromParent();
+  }
+
+  // // Add syncs to the helper's cloned resume blocks.
+  // for (BasicBlock *BB : Resumes) {
+  //   BasicBlock *HelperResume = cast<BasicBlock>(VMap[BB]);
+  //   assert(isa<ResumeInst>(HelperResume->getTerminator()));
+  //   BasicBlock *NewHelperResume = SplitBlock(HelperResume,
+  //                                            HelperResume->getTerminator(),
+  //                                            DT, LI);
+  //   IRBuilder<> Builder(&(HelperResume->front()));
+  //   SyncInst *NewSync = Builder.CreateSync(NewHelperResume);
+  //   // Set debug info of new sync to match that of terminator of the
+  //   // header of the cloned loop.
+  //   NewSync->setDebugLoc(HelperHeader->getTerminator()->getDebugLoc());
+  //   HelperResume->getTerminator()->eraseFromParent();
+  // }
+
+  BasicBlock *NewPreheader = cast<BasicBlock>(VMap[Preheader]);
+  PHINode *NewCanonicalIV = cast<PHINode>(VMap[CanonicalIV]);
+
+  // Rewrite the cloned IV's to start at the start iteration argument.
+  {
+    // Rewrite clone of canonical IV to start at the start iteration
+    // argument.
+    Argument *NewCanonicalIVStart = cast<Argument>(VMap[InputMap[CanonicalIV]]);
+    {
+      int NewPreheaderIdx = NewCanonicalIV->getBasicBlockIndex(NewPreheader);
+      assert(isa<Constant>(NewCanonicalIV->getIncomingValue(NewPreheaderIdx)) &&
+             "Cloned canonical IV does not inherit a constant value from cloned preheader.");
+      NewCanonicalIV->setIncomingValue(NewPreheaderIdx, NewCanonicalIVStart);
+    }
+
+    // Rewrite other cloned IV's to start at their value at the start
+    // iteration.
+    const SCEV *StartIterSCEV = SE.getSCEV(NewCanonicalIVStart);
+    DEBUG(dbgs() << "StartIterSCEV: " << *StartIterSCEV << "\n");
+    for (PHINode *IV : IVs) {
+      if (CanonicalIV == IV) continue;
+
+      // Get the value of the IV at the start iteration.
+      DEBUG(dbgs() << "IV " << *IV);
+      const SCEV *IVSCEV = SE.getSCEV(IV);
+      DEBUG(dbgs() << " (SCEV " << *IVSCEV << ")");
+      const SCEVAddRecExpr *IVSCEVAddRec = cast<const SCEVAddRecExpr>(IVSCEV);
+      const SCEV *IVAtIter = IVSCEVAddRec->evaluateAtIteration(StartIterSCEV, SE);
+      DEBUG(dbgs() << " expands at iter " << *StartIterSCEV <<
+            " to " << *IVAtIter << "\n");
+
+      // NOTE: Expanded code should not refer to other IV's.
+      Value *IVStart = Exp.expandCodeFor(IVAtIter, IVAtIter->getType(),
+                                         NewPreheader->getTerminator());
+
+      // Set the value that the cloned IV inherits from the cloned preheader.
+      PHINode *NewIV = cast<PHINode>(VMap[IV]);
+      int NewPreheaderIdx = NewIV->getBasicBlockIndex(NewPreheader);
+      assert(isa<Constant>(NewIV->getIncomingValue(NewPreheaderIdx)) &&
+             "Cloned IV does not inherit a constant value from cloned preheader.");
+      NewIV->setIncomingValue(NewPreheaderIdx, IVStart);
+    }
+
+    // Remap the newly added instructions in the new preheader to use
+    // values local to the helper.
+    for (Instruction &II : *NewPreheader)
+      RemapInstruction(&II, VMap, RF_IgnoreMissingLocals,
+                       /*TypeMapper=*/nullptr, /*Materializer=*/nullptr);
+  }
+
+  // The loop has been outlined by this point.  To handle the special cases
+  // where the loop limit was constant or used elsewhere within the loop,
+  // this pass rewrites the outlined loop-latch condition to use the explicit
+  // end-iteration argument.
+  if (isa<Constant>(LimitVar) || !LimitVar->hasOneUse()) {
+    CmpInst *HelperCond = cast<CmpInst>(VMap[NewCond]);
+    assert(((isa<Constant>(LimitVar) &&
+             HelperCond->getOperand(1) == LimitVar) ||
+            (!LimitVar->hasOneUse() &&
+             HelperCond->getOperand(1) == VMap[LimitVar])) &&
+           "Unexpected condition in loop latch.");
+    IRBuilder<> Builder(HelperCond);
+    Value *NewHelperCond = Builder.CreateICmpULT(HelperCond->getOperand(0),
+                                                 VMap[InputMap[LimitVar]]);
+    HelperCond->replaceAllUsesWith(NewHelperCond);
+    HelperCond->eraseFromParent();
+    DEBUG(dbgs() << "Rewritten Latch: " <<
+          *(cast<Instruction>(NewHelperCond)->getParent()));
+  }
+
+  // DEBUGGING: Simply serialize the cloned loop.
+  // BasicBlock *NewHeader = cast<BasicBlock>(VMap[Header]);
+  // SerializeDetachedCFG(cast<DetachInst>(NewHeader->getTerminator()), nullptr);
+  implementDACIterSpawnOnHelper(Helper, NewPreheader,
+                                cast<BasicBlock>(VMap[Header]),
+                                cast<PHINode>(VMap[CanonicalIV]),
+                                cast<Argument>(VMap[InputMap[LimitVar]]),
+                                cast<Argument>(VMap[InputMap[GrainVar]]),
+                                cast<Instruction>(VMap[InputSyncRegion]),
+                                /*DT=*/nullptr, /*LI=*/nullptr,
+                                CanonicalSCEV->getNoWrapFlags(SCEV::FlagNUW),
+                                CanonicalSCEV->getNoWrapFlags(SCEV::FlagNSW));
+
+  if (verifyFunction(*Helper, &dbgs()))
+    return false;
+
+  // Update allocas in cloned loop body.
+  {
+    // Collect reattach instructions.
+    SmallVector<Instruction *, 4> ReattachPoints;
+    for (pred_iterator PI = pred_begin(Latch), PE = pred_end(Latch);
+         PI != PE; ++PI) {
+      BasicBlock *Pred = *PI;
+      if (!isa<ReattachInst>(Pred->getTerminator())) continue;
+      if (L->contains(Pred))
+        ReattachPoints.push_back(cast<BasicBlock>(VMap[Pred])->getTerminator());
+    }
+    // The cloned loop should be serialized by this point.
+    BasicBlock *ClonedLoopBodyEntry =
+      cast<BasicBlock>(VMap[Header])->getSingleSuccessor();
+    assert(ClonedLoopBodyEntry &&
+           "Head of cloned loop body has multiple successors.");
+    bool ContainsDynamicAllocas =
+      MoveStaticAllocasInBlock(&Helper->getEntryBlock(), ClonedLoopBodyEntry,
+                               ReattachPoints);
+
+    // If the cloned loop contained dynamic alloca instructions, wrap the
+    // cloned loop with llvm.stacksave/llvm.stackrestore intrinsics.
+    if (ContainsDynamicAllocas) {
+      Module *M = Helper->getParent();
+      // Get the two intrinsics we care about.
+      Function *StackSave = Intrinsic::getDeclaration(M, Intrinsic::stacksave);
+      Function *StackRestore =
+        Intrinsic::getDeclaration(M, Intrinsic::stackrestore);
+
+      // Insert the llvm.stacksave.
+      CallInst *SavedPtr = IRBuilder<>(&*ClonedLoopBodyEntry,
+                                       ClonedLoopBodyEntry->begin())
+                             .CreateCall(StackSave, {}, "savedstack");
+
+      // Insert a call to llvm.stackrestore before the reattaches in the
+      // original Tapir loop.
+      for (Instruction *ExitPoint : ReattachPoints)
+        IRBuilder<>(ExitPoint).CreateCall(StackRestore, SavedPtr);
+    }
+  }
+
+  if (verifyFunction(*Helper, &dbgs()))
+    return false;
+
+  // Add alignment assumptions to arguments of helper, based on alignment of
+  // values in old function.
+  AddAlignmentAssumptions(F, Inputs, VMap,
+                          Preheader->getTerminator(), AC, DT);
+
+  // Add call to new helper function in original function.
+  {
+    // Setup arguments for call.
+    SmallVector<Value *, 4> TopCallArgs;
+    // Add start iteration 0.
+    assert(CanonicalSCEV->getStart()->isZero() &&
+           "Canonical IV does not start at zero.");
+    TopCallArgs.push_back(ConstantInt::get(CanonicalIV->getType(), 0));
+    // Add loop limit.
+    TopCallArgs.push_back(LimitVar);
+    // Add grainsize.
+    TopCallArgs.push_back(GrainVar);
+    // Add the rest of the arguments.
+    for (Value *V : BodyInputs)
+      TopCallArgs.push_back(V);
+    DEBUG({
+        for (Value *TCArg : TopCallArgs)
+          dbgs() << "Top call arg: " << *TCArg << "\n";
+      });
+
+    // Create call instruction.
+    IRBuilder<> Builder(Preheader->getTerminator());
+    CallInst *TopCall = Builder.CreateCall(Helper,
+                                           ArrayRef<Value *>(TopCallArgs));
+
+    // Use a fast calling convention for the helper.
+    TopCall->setCallingConv(CallingConv::Fast);
+    // TopCall->setCallingConv(Helper->getCallingConv());
+    TopCall->setDebugLoc(Header->getTerminator()->getDebugLoc());
+    // // Update CG graph with the call we just added.
+    // CG[F]->addCalledFunction(TopCall, CG[Helper]);
+  }
+
+  // Remove sync of loop in parent.
+  {
+    // Get the sync region for this loop's detached iterations.
+    DetachInst *HeadDetach = cast<DetachInst>(Header->getTerminator());
+    Value *SyncRegion = HeadDetach->getSyncRegion();
+    // Check the Tapir instructions contained in this sync region.  Look for
+    // a single sync instruction among those Tapir instructions.  Meanwhile,
+    // verify that the only detach instruction in this sync region is the
+    // detach in the loop header.  If these conditions are met, then we
+    // assume that the sync applies to this loop.  Otherwise, something more
+    // complicated is going on, and we give up.
+    SyncInst *LoopSync = nullptr;
+    bool SingleSyncJustForLoop = true;
+    for (User *U : SyncRegion->users()) {
+      // Skip the detach in the loop header.
+      if (HeadDetach == U) continue;
+      // Remember the first sync instruction we find.  If we find multiple
+      // sync instructions, then something nontrivial is going on.
+      if (SyncInst *SI = dyn_cast<SyncInst>(U)) {
+        if (!LoopSync)
+          LoopSync = SI;
+        else
+          SingleSyncJustForLoop = false;
+      }
+      // If we find a detach instruction that is not the loop header's, then
+      // something nontrivial is going on.
+      if (isa<DetachInst>(U))
+        SingleSyncJustForLoop = false;
+    }
+    if (LoopSync && SingleSyncJustForLoop)
+      // Replace the sync with a branch.
+      ReplaceInstWithInst(LoopSync,
+                          BranchInst::Create(LoopSync->getSuccessor(0)));
+    else if (!LoopSync)
+      DEBUG(dbgs() << "No sync found for this loop.");
+    else
+      DEBUG(dbgs() << "No single sync found that only affects this loop.");
+  }
+
+  ++LoopsConvertedToDAC;
+
+  unlinkLoop();
+
+  return Helper;
+}
+
+/// \brief Replace the latch of the loop to check that IV is always less than
+/// or equal to the limit.
+///
+/// This method assumes that the loop has a single loop latch.
+Value *CilkABILoopSpawning::canonicalizeLoopLatch(PHINode *IV, Value *Limit) {
+  Loop *L = OrigLoop;
+
+  Value *NewCondition;
+  BasicBlock *Header = L->getHeader();
+  BasicBlock *Latch = L->getLoopLatch();
+  assert(Latch && "No single loop latch found for loop.");
+
+  IRBuilder<> Builder(&*Latch->getFirstInsertionPt());
+
+  // This process assumes that IV's increment is in Latch.
+
+  // Create comparison between IV and Limit at top of Latch.
+  NewCondition =
+    Builder.CreateICmpULT(Builder.CreateAdd(IV,
+                                            ConstantInt::get(IV->getType(), 1)),
+                          Limit);
+
+  // Replace the conditional branch at the end of Latch.
+  BranchInst *LatchBr = dyn_cast_or_null<BranchInst>(Latch->getTerminator());
+  assert(LatchBr && LatchBr->isConditional() &&
+         "Latch does not terminate with a conditional branch.");
+  Builder.SetInsertPoint(Latch->getTerminator());
+  Builder.CreateCondBr(NewCondition, Header, ExitBlock);
+
+  // Erase the old conditional branch.
+  LatchBr->eraseFromParent();
+
+  return NewCondition;
+}
+
+/// Top-level call to convert a Tapir loop to be processed using an
+/// appropriate Cilk ABI call.
+bool CilkABILoopSpawning::processLoop() {
+  Loop *L = OrigLoop;
+
+  BasicBlock *Header = L->getHeader();
+  BasicBlock *Preheader = L->getLoopPreheader();
+  BasicBlock *Latch = L->getLoopLatch();
+
+  using namespace ore;
+
+  // Check the exit blocks of the loop.
+  if (!ExitBlock) {
+    DEBUG(dbgs() << "LS loop does not contain valid exit block after latch.\n");
+    ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "InvalidLatchExit",
+                                        L->getStartLoc(),
+                                        Header)
+             << "invalid latch exit");
+    return false;
+  }
+
+  SmallVector<BasicBlock *, 4> ExitBlocks;
+  L->getExitBlocks(ExitBlocks);
+  for (const BasicBlock *Exit : ExitBlocks) {
+    if (Exit == ExitBlock) continue;
+    if (!isa<UnreachableInst>(Exit->getTerminator())) {
+      DEBUG(dbgs() << "LS loop contains a bad exit block " << *Exit);
+      ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "BadExit",
+                                          L->getStartLoc(),
+                                          Header)
+               << "bad exit block found");
+      return false;
+    }
+  }
+
+  Function *F = Header->getParent();
+  Module *M = F->getParent();
+
+  DEBUG(dbgs() << "LS loop header:" << *Header);
+  DEBUG(dbgs() << "LS loop latch:" << *Latch);
+
+  // DEBUG(dbgs() << "LS SE backedge taken count: "
+  //              << *(SE.getBackedgeTakenCount(L)) << "\n");
+  // DEBUG(dbgs() << "LS SE max backedge taken count: "
+  //              << *(SE.getMaxBackedgeTakenCount(L)) << "\n");
+  DEBUG(dbgs() << "LS SE exit count: " << *(SE.getExitCount(L, Latch)) << "\n");
+
+  /// Get loop limit.
+  const SCEV *BETC = SE.getExitCount(L, Latch);
+  const SCEV *Limit = SE.getAddExpr(BETC, SE.getOne(BETC->getType()));
+  DEBUG(dbgs() << "LS Loop limit: " << *Limit << "\n");
+  // PredicatedScalarEvolution PSE(SE, *L);
+  // const SCEV *PLimit = PSE.getExitCount(L, Latch);
+  // DEBUG(dbgs() << "LS predicated loop limit: " << *PLimit << "\n");
+  // emitAnalysis(LoopSpawningReport()
+  //              << "computed loop limit " << *Limit << "\n");
+  if (SE.getCouldNotCompute() == Limit) {
+    DEBUG(dbgs() << "SE could not compute loop limit.\n");
+    ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "UnknownLoopLimit",
+                                        L->getStartLoc(),
+                                        Header)
+             << "could not compute limit");
+    return false;
+  }
+  // ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "LoopLimit", L->getStartLoc(),
+  //                                     Header)
+  //          << "loop limit: " << NV("Limit", Limit));
+  /// Clean up the loop's induction variables.
+  PHINode *CanonicalIV = canonicalizeIVs(Limit->getType());
+  if (!CanonicalIV) {
+    DEBUG(dbgs() << "Could not get canonical IV.\n");
+    // emitAnalysis(LoopSpawningReport()
+    //              << "Could not get a canonical IV.\n");
+    ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "NoCanonicalIV",
+                                        L->getStartLoc(),
+                                        Header)
+             << "could not find or create canonical IV");
+    return false;
+  }
+  const SCEVAddRecExpr *CanonicalSCEV =
+    cast<const SCEVAddRecExpr>(SE.getSCEV(CanonicalIV));
+
+  // Remove all IV's other than CanonicalIV.
+  // First, check that we can do this.
+  bool CanRemoveIVs = true;
+  for (BasicBlock::iterator II = Header->begin(); isa<PHINode>(II); ++II) {
+    PHINode *PN = cast<PHINode>(II);
+    if (CanonicalIV == PN) continue;
+    // dbgs() << "IV " << *PN;
+    const SCEV *S = SE.getSCEV(PN);
+    // dbgs() << " SCEV " << *S << "\n";
+    if (SE.getCouldNotCompute() == S) {
+      // emitAnalysis(LoopSpawningReport(PN)
+      //              << "Could not compute the scalar evolution.\n");
+      ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "NoSCEV", PN)
+               << "could not compute scalar evolution of "
+               << NV("PHINode", PN));
+      CanRemoveIVs = false;
+    }
+  }
+
+  if (!CanRemoveIVs) {
+    DEBUG(dbgs() << "Could not compute scalar evolutions for all IV's.\n");
+    return false;
+  }
+
+  ////////////////////////////////////////////////////////////////////////
+  // We now have everything we need to extract the loop.  It's time to
+  // do some surgery.
+
+  SCEVExpander Exp(SE, M->getDataLayout(), "ls");
+
+  // Remove the IV's (other than CanonicalIV) and replace them with
+  // their stronger forms.
+  //
+  // TODO?: We can probably adapt this process such that we don't require
+  // all IV's to be canonical.
+  {
+    SmallVector<PHINode *, 8> IVsToRemove;
+    for (BasicBlock::iterator II = Header->begin(); isa<PHINode>(II); ++II) {
+      PHINode *PN = cast<PHINode>(II);
+      if (PN == CanonicalIV) continue;
+      const SCEV *S = SE.getSCEV(PN);
+      Value *NewIV = Exp.expandCodeFor(S, S->getType(), CanonicalIV);
+      PN->replaceAllUsesWith(NewIV);
+      IVsToRemove.push_back(PN);
+    }
+    for (PHINode *PN : IVsToRemove)
+      PN->eraseFromParent();
+  }
+
+  // All remaining IV's should be canonical.  Collect them.
+  //
+  // TODO?: We can probably adapt this process such that we don't require
+  // all IV's to be canonical.
+  SmallVector<PHINode *, 8> IVs;
+  bool AllCanonical = true;
+  for (BasicBlock::iterator II = Header->begin(); isa<PHINode>(II); ++II) {
+    PHINode *PN = cast<PHINode>(II);
+    DEBUG({
+        const SCEVAddRecExpr *PNSCEV =
+          dyn_cast<const SCEVAddRecExpr>(SE.getSCEV(PN));
+        assert(PNSCEV && "PHINode did not have corresponding SCEVAddRecExpr");
+        assert(PNSCEV->getStart()->isZero() &&
+               "PHINode SCEV does not start at 0");
+        dbgs() << "LS step recurrence for SCEV " << *PNSCEV << " is "
+               << *(PNSCEV->getStepRecurrence(SE)) << "\n";
+        assert(PNSCEV->getStepRecurrence(SE)->isOne() &&
+               "PHINode SCEV step is not 1");
+      });
+    if (ConstantInt *C =
+        dyn_cast<ConstantInt>(PN->getIncomingValueForBlock(Preheader))) {
+      if (C->isZero())
+        IVs.push_back(PN);
+    } else {
+      AllCanonical = false;
+      DEBUG(dbgs() << "Remaining non-canonical PHI Node found: " << *PN
+                   << "\n");
+      // emitAnalysis(LoopSpawningReport(PN)
+      //              << "Found a remaining non-canonical IV.\n");
+      ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "NonCanonicalIV", PN)
+               << "found a remaining noncanonical IV");
+    }
+  }
+  if (!AllCanonical)
+    return false;
+
+  // Insert the computation for the loop limit into the Preheader.
+  Value *LimitVar = Exp.expandCodeFor(Limit, Limit->getType(),
+                                      Preheader->getTerminator());
+  DEBUG(dbgs() << "LimitVar: " << *LimitVar << "\n");
+
+  // Canonicalize the loop latch.
+  Value *NewCond = canonicalizeLoopLatch(CanonicalIV, LimitVar);
+
+  /// Clone the loop into a new function.
+
+  // Get the inputs and outputs for the Loop blocks.
+  SetVector<Value *> Inputs, Outputs;
+  SetVector<Value *> BodyInputs, BodyOutputs;
+  ValueToValueMapTy VMap, InputMap;
+  std::vector<BasicBlock *> LoopBlocks;
+  AllocaInst *closure;
+  // Add start iteration, end iteration, and grainsize to inputs.
+  {
+    LoopBlocks = L->getBlocks();
+    // // Add exit blocks terminated by unreachable.  There should not be any
+    // // other exit blocks in the loop.
+    // SmallSet<BasicBlock *, 4> UnreachableExits;
+    // for (BasicBlock *Exit : ExitBlocks) {
+    //   if (Exit == ExitBlock) continue;
+    //   assert(isa<UnreachableInst>(Exit->getTerminator()) &&
+    //          "Found problematic exit block.");
+    //   UnreachableExits.insert(Exit);
+    // }
+
+    // // Add unreachable and exception-handling exits to the set of loop
+    // // blocks to clone.
+    // for (BasicBlock *BB : UnreachableExits)
+    //   LoopBlocks.push_back(BB);
+    // for (BasicBlock *BB : EHExits)
+    //   LoopBlocks.push_back(BB);
+
+    // DEBUG({
+    //     dbgs() << "LoopBlocks: ";
+    //     for (BasicBlock *LB : LoopBlocks)
+    //       dbgs() << LB->getName() << "("
+    //              << *(LB->getTerminator()) << "), ";
+    //     dbgs() << "\n";
+    //   });
+
+    // Get the inputs and outputs for the loop body.
+    {
+      // CodeExtractor Ext(LoopBlocks, DT);
+      // Ext.findInputsOutputs(BodyInputs, BodyOutputs);
+      SmallPtrSet<BasicBlock *, 32> Blocks;
+      for (BasicBlock *BB : LoopBlocks)
+        Blocks.insert(BB);
+      findInputsOutputs(Blocks, BodyInputs, BodyOutputs);
+    }
+
+    // Add argument for start of CanonicalIV.
+    DEBUG({
+        Value *CanonicalIVInput =
+          CanonicalIV->getIncomingValueForBlock(Preheader);
+        // CanonicalIVInput should be the constant 0.
+        assert(isa<Constant>(CanonicalIVInput) &&
+               "Input to canonical IV from preheader is not constant.");
+      });
+    Argument *StartArg = new Argument(CanonicalIV->getType(),
+                                      CanonicalIV->getName()+".start");
+    Inputs.insert(StartArg);
+    InputMap[CanonicalIV] = StartArg;
+
+    // Add argument for end.
+    Value *ea;
+    if (isa<Constant>(LimitVar)) {
+      Argument *EndArg = new Argument(LimitVar->getType(), "end");
+      Inputs.insert(EndArg);
+      ea = InputMap[LimitVar] = EndArg;
+    } else {
+      Inputs.insert(LimitVar);
+      ea = InputMap[LimitVar] = LimitVar;
+    }
+
+    // Put all of the inputs together, and clear redundant inputs from
+    // the set for the loop body.
+    SmallVector<Value *, 8> BodyInputsToRemove;
+    SmallVector<Value *, 8> StructInputs;
+    SmallVector<Type *, 8> StructIT;
+    for (Value *V : BodyInputs) {
+      if (!Inputs.count(V)) {
+        StructInputs.push_back(V);
+        StructIT.push_back(V->getType());
+      } else
+        BodyInputsToRemove.push_back(V);
+    }
+    StructType *ST = StructType::create(StructIT);
+    IRBuilder<> B(L->getLoopPreheader()->getTerminator());
+    IRBuilder<> B2(L->getHeader()->getFirstNonPHIOrDbgOrLifetime());
+    closure = B.CreateAlloca(ST);
+    // Pack each closure input into the struct in the preheader, reload it at
+    // the top of the header, and rewrite the in-loop uses to the reload.
+    for (unsigned i = 0; i < StructInputs.size(); i++) {
+      B.CreateStore(StructInputs[i], B.CreateConstGEP2_32(ST, closure, 0, i));
+      Value *l2 = B2.CreateLoad(B2.CreateConstGEP2_32(ST, closure, 0, i));
+      auto UI = StructInputs[i]->use_begin(), E = StructInputs[i]->use_end();
+      for (; UI != E;) {
+        Use &U = *UI;
+        ++UI;
+        auto *Usr = dyn_cast<Instruction>(U.getUser());
+        if (Usr && !L->contains(Usr->getParent()))
+          continue;
+        U.set(l2);
+      }
+    }
+    Inputs.insert(closure);
+    // Reinsert the start and end arguments so they sit at the end of the
+    // input list.
+    Inputs.remove(StartArg);
+    Inputs.insert(StartArg);
+    Inputs.remove(ea);
+    Inputs.insert(ea);
+    for (Value *V : BodyInputsToRemove)
+      BodyInputs.remove(V);
+    assert(0 == BodyOutputs.size() &&
+           "All results from parallel loop should be passed by memory already.");
+  }
+  DEBUG({
+      for (Value *V : Inputs)
+        dbgs() << "EL input: " << *V << "\n";
+      for (Value *V : Outputs)
+        dbgs() << "EL output: " << *V << "\n";
+    });
+
+  Function *Helper;
+  {
+    SmallVector<ReturnInst *, 4> Returns;  // Ignore returns cloned.
+
+    // LowerDbgDeclare(*(Header->getParent()));
+
+    Helper = CreateHelper(Inputs, Outputs, L->getBlocks(),
+                          Header, Preheader, ExitBlock /*L->getExitBlock()*/,
+                          VMap, M,
+                          F->getSubprogram() != nullptr, Returns, ".ls",
+                          nullptr, nullptr, nullptr);
+
+    assert(Returns.empty() && "Returns cloned when cloning loop.");
+
+    // Use a fast calling convention for the helper.
+    // Helper->setCallingConv(CallingConv::Fast);
+    // Helper->setCallingConv(Header->getParent()->getCallingConv());
+  }
+
+  BasicBlock *NewPreheader = cast<BasicBlock>(VMap[Preheader]);
+  PHINode *NewCanonicalIV = cast<PHINode>(VMap[CanonicalIV]);
+
+  // Rewrite the cloned IV's to start at the start iteration argument.
+  {
+    // Rewrite clone of canonical IV to start at the start iteration
+    // argument.
+    Argument *NewCanonicalIVStart = cast<Argument>(VMap[InputMap[CanonicalIV]]);
+    {
+      int NewPreheaderIdx = NewCanonicalIV->getBasicBlockIndex(NewPreheader);
+      assert(isa<Constant>(NewCanonicalIV->getIncomingValue(NewPreheaderIdx)) &&
+             "Cloned canonical IV does not inherit a constant value from cloned preheader.");
+      NewCanonicalIV->setIncomingValue(NewPreheaderIdx, NewCanonicalIVStart);
+    }
+
+    // Rewrite other cloned IV's to start at their value at the start
+    // iteration.
+    const SCEV *StartIterSCEV = SE.getSCEV(NewCanonicalIVStart);
+    DEBUG(dbgs() << "StartIterSCEV: " << *StartIterSCEV << "\n");
+    for (PHINode *IV : IVs) {
+      if (CanonicalIV == IV) continue;
+
+      // Get the value of the IV at the start iteration.
+      DEBUG(dbgs() << "IV " << *IV);
+      const SCEV *IVSCEV = SE.getSCEV(IV);
+      DEBUG(dbgs() << " (SCEV " << *IVSCEV << ")");
+      const SCEVAddRecExpr *IVSCEVAddRec = cast<const SCEVAddRecExpr>(IVSCEV);
+      const SCEV *IVAtIter = IVSCEVAddRec->evaluateAtIteration(StartIterSCEV, SE);
+      DEBUG(dbgs() << " expands at iter " << *StartIterSCEV <<
+            " to " << *IVAtIter << "\n");
+
+      // NOTE: Expanded code should not refer to other IV's.
+      Value *IVStart = Exp.expandCodeFor(IVAtIter, IVAtIter->getType(),
+                                         NewPreheader->getTerminator());
+
+      // Set the value that the cloned IV inherits from the cloned preheader.
+      PHINode *NewIV = cast<PHINode>(VMap[IV]);
+      int NewPreheaderIdx = NewIV->getBasicBlockIndex(NewPreheader);
+      assert(isa<Constant>(NewIV->getIncomingValue(NewPreheaderIdx)) &&
+             "Cloned IV does not inherit a constant value from cloned preheader.");
+      NewIV->setIncomingValue(NewPreheaderIdx, IVStart);
+    }
+
+    // Remap the newly added instructions in the new preheader to use
+    // values local to the helper.
+    for (Instruction &II : *NewPreheader)
+      RemapInstruction(&II, VMap, RF_IgnoreMissingLocals,
+                       /*TypeMapper=*/nullptr, /*Materializer=*/nullptr);
+  }
+
+  // If the loop limit is constant, then rewrite the loop latch
+  // condition to use the end-iteration argument.
+  if (isa<Constant>(LimitVar)) {
+    CmpInst *HelperCond = cast<CmpInst>(VMap[NewCond]);
+    assert(HelperCond->getOperand(1) == LimitVar);
+    IRBuilder<> Builder(HelperCond);
+    Value *NewHelperCond = Builder.CreateICmpULT(HelperCond->getOperand(0),
+                                                 VMap[InputMap[LimitVar]]);
+    HelperCond->replaceAllUsesWith(NewHelperCond);
+    HelperCond->eraseFromParent();
+  }
+
+  // For debugging:
+  BasicBlock *NewHeader = cast<BasicBlock>(VMap[Header]);
+  SerializeDetachedCFG(cast<DetachInst>(NewHeader->getTerminator()), nullptr);
+  {
+    Value *v = &*Helper->arg_begin();
+    auto UI = v->use_begin(), E = v->use_end();
+    for (; UI != E;) {
+      Use &U = *UI;
+      ++UI;
+      auto *Usr = dyn_cast<Instruction>(U.getUser());
+      Usr->moveBefore(Helper->getEntryBlock().getTerminator());
+
+      auto UI2 = Usr->use_begin(), E2 = Usr->use_end();
+      for (; UI2 != E2;) {
+        Use &U2 = *UI2;
+        ++UI2;
+        auto *Usr2 = dyn_cast<Instruction>(U2.getUser());
+        Usr2->moveBefore(Helper->getEntryBlock().getTerminator());
+      }
+    }
+  }
+
+  if (verifyFunction(*Helper, &dbgs()))
+    return false;
+
+  // Add call to new helper function in original function.
+  {
+    // Setup arguments for call.
+    SetVector<Value *> TopCallArgs;
+    // Add start iteration 0.
+    assert(CanonicalSCEV->getStart()->isZero() &&
+           "Canonical IV does not start at zero.");
+    TopCallArgs.insert(ConstantInt::get(CanonicalIV->getType(), 0));
+    // Add loop limit.
+    TopCallArgs.insert(LimitVar);
+    // Add grainsize.
+    // TopCallArgs.insert(GrainVar);
+    // Add the rest of the arguments.
+    for (Value *V : BodyInputs)
+      TopCallArgs.insert(V);
+
+    // Create call instruction.
+    IRBuilder<> Builder(Preheader->getTerminator());
+
+    llvm::Function *F;
+    if (((llvm::IntegerType *)LimitVar->getType())->getBitWidth() == 32)
+      F = CILKRTS_FUNC(cilk_for_32, *M);
+    else {
+      assert(((llvm::IntegerType *)LimitVar->getType())->getBitWidth() == 64);
+      F = CILKRTS_FUNC(cilk_for_64, *M);
+    }
+    llvm::Value *args[] = {
+      Builder.CreatePointerCast(Helper, F->getFunctionType()->getParamType(0)),
+      Builder.CreatePointerCast(closure, F->getFunctionType()->getParamType(1)),
+      LimitVar,
+      ConstantInt::get(IntegerType::get(F->getContext(), sizeof(int)*8), 0)
+    };
+
+    /*CallInst *TopCall = */Builder.CreateCall(F, args);
+
+    // Use a fast calling convention for the helper.
+    // TopCall->setCallingConv(CallingConv::Fast);
+    // TopCall->setCallingConv(Helper->getCallingConv());
+    // TopCall->setDebugLoc(Header->getTerminator()->getDebugLoc());
+    // // Update CG graph with the call we just added.
+    // CG[F]->addCalledFunction(TopCall, CG[Helper]);
+  }
+
+  ++LoopsConvertedToCilkABI;
+
+  unlinkLoop();
+
+  return Helper;
+}
+
+/// Checks if this loop is a Tapir loop.  Right now we check that the loop is
+/// in a canonical form:
+///   1) The header detaches the body.
+///   2) The loop contains a single latch.
+///   3) The body reattaches to the latch (which is necessary for a valid
+///      detached CFG).
+///   4) The loop only branches to the exit block from the header or the
+///      latch.
+bool LoopSpawningImpl::isTapirLoop(const Loop *L) {
+  const BasicBlock *Header = L->getHeader();
+  const BasicBlock *Latch = L->getLoopLatch();
+  // const BasicBlock *Exit = L->getExitBlock();
+
+  // DEBUG(dbgs() << "LS checking if Tapir loop: " << *L);
+
+  // Header must be terminated by a detach.
+  if (!isa<DetachInst>(Header->getTerminator())) {
+    DEBUG(dbgs() << "LS loop header is not terminated by a detach: " << *L
+                 << "\n");
+    return false;
+  }
+
+  // Loop must have a unique latch.
+  if (nullptr == Latch) {
+    DEBUG(dbgs() << "LS loop does not have a unique latch: " << *L << "\n");
+    return false;
+  }
+
+  // // Loop must have a unique exit block.
+  // if (nullptr == Exit) {
+  //   DEBUG(dbgs() << "LS loop does not have a unique exit block: " << *L
+  //                << "\n");
+  //   SmallVector<BasicBlock *, 4> ExitBlocks;
+  //   L->getUniqueExitBlocks(ExitBlocks);
+  //   for (BasicBlock *Exit : ExitBlocks)
+  //     DEBUG(dbgs() << *Exit);
+  //   return false;
+  // }
+
+  // Continuation of header terminator must be the latch.
+  const DetachInst *HeaderDetach = cast<DetachInst>(Header->getTerminator());
+  const BasicBlock *Continuation = HeaderDetach->getContinue();
+  if (Continuation != Latch) {
+    DEBUG(dbgs() << "LS continuation of detach in header is not the latch: "
+                 << *L << "\n");
+    return false;
+  }
+
+  // All other predecessors of Latch are terminated by reattach instructions.
+  for (auto PI = pred_begin(Latch), PE = pred_end(Latch); PI != PE; ++PI) {
+    const BasicBlock *Pred = *PI;
+    if (Header == Pred) continue;
+    if (!isa<ReattachInst>(Pred->getTerminator())) {
+      DEBUG(dbgs() << "LS Latch has a predecessor that is not terminated "
+                   << "by a reattach: " << *L << "\n");
+      return false;
+    }
+  }
+
+  // Get the exit block from Latch.
+  const BasicBlock *Exit = Latch->getTerminator()->getSuccessor(0);
+  if (Header == Exit)
+    Exit = Latch->getTerminator()->getSuccessor(1);
+
+  // The only predecessors of Exit inside the loop are Header and Latch.
+  for (auto PI = pred_begin(Exit), PE = pred_end(Exit); PI != PE; ++PI) {
+    const BasicBlock *Pred = *PI;
+    if (!L->contains(Pred))
+      continue;
+    if (Header != Pred && Latch != Pred) {
+      DEBUG(dbgs() << "LS Loop branches to exit block from a block "
+                   << "other than the header or latch" << *L << "\n");
+      return false;
+    }
+  }
+
+  return true;
+}
+
+/// This routine recursively examines all descendants of the specified loop and
+/// adds all Tapir loops in that tree to the vector.  This routine performs a
+/// pre-order traversal of the tree of loops and pushes each Tapir loop found
+/// onto the end of the vector.
+void LoopSpawningImpl::addTapirLoop(Loop *L, SmallVectorImpl<Loop *> &V) {
+  if (isTapirLoop(L)) {
+    V.push_back(L);
+    return;
+  }
+
+  LoopSpawningHints Hints(L, ORE);
+
+  DEBUG(dbgs() << "LS: Loop hints:"
+               << " strategy = " << Hints.printStrategy(Hints.getStrategy())
+               << "\n");
+
+  using namespace ore;
+
+  if (LoopSpawningHints::ST_SEQ != Hints.getStrategy()) {
+    DEBUG(dbgs() << "LS: Marked loop is not a valid Tapir loop.\n"
+                 << "\tLoop hints:"
+                 << " strategy = " << Hints.printStrategy(Hints.getStrategy())
+                 << "\n");
+    ORE.emit(OptimizationRemarkMissed(LS_NAME, "NotTapir",
+                                      L->getStartLoc(), L->getHeader())
+             << "marked loop is not a valid Tapir loop");
+  }
+
+  for (Loop *InnerL : *L)
+    addTapirLoop(InnerL, V);
+}
+
+#ifndef NDEBUG
+/// \return string containing a file name and a line # for the given loop.
+static std::string getDebugLocString(const Loop *L) {
+  std::string Result;
+  if (L) {
+    raw_string_ostream OS(Result);
+    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
+      LoopDbgLoc.print(OS);
+    else
+      // Just print the module name.
+      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
+    OS.flush();
+  }
+  return Result;
+}
+#endif
+
+bool LoopSpawningImpl::run() {
+  // Build up a worklist of inner-loops to vectorize.
+  // This is necessary as the act of vectorizing or partially unrolling a loop
+  // creates new loops and can invalidate iterators across the loops.
+  SmallVector<Loop *, 8> Worklist;
+
+  // Examine all top-level loops in this function, and call addTapirLoop to push
+  // those loops onto the work list.
+  for (Loop *L : LI)
+    addTapirLoop(L, Worklist);
+
+  LoopsAnalyzed += Worklist.size();
+
+  // Now walk the identified inner loops.
+  bool Changed = false;
+  while (!Worklist.empty())
+    // Process the work list of loops backwards.  For each tree of loops in this
+    // function, addTapirLoop pushed those loops onto the work list according to
+    // a pre-order tree traversal.  Therefore, processing the work list
+    // backwards leads us to process innermost loops first.
+    Changed |= processLoop(Worklist.pop_back_val());
+
+  // Process each loop nest in the function.
+  return Changed;
+}
+
+// Top-level routine to process a given loop.
+bool LoopSpawningImpl::processLoop(Loop *L) {
+#ifndef NDEBUG
+  const std::string DebugLocStr = getDebugLocString(L);
+#endif /* NDEBUG */
+
+  // Function containing loop
+  Function *F = L->getHeader()->getParent();
+
+  DEBUG(dbgs() << "\nLS: Checking a Tapir loop in \""
+               << L->getHeader()->getParent()->getName() << "\" from "
+               << DebugLocStr << ": " << *L << "\n");
+
+  LoopSpawningHints Hints(L, ORE);
+
+  DEBUG(dbgs() << "LS: Loop hints:"
+               << " strategy = " << Hints.printStrategy(Hints.getStrategy())
+               << "\n");
+
+  using namespace ore;
+
+  // Get the loop preheader.  LoopSimplify should guarantee that the loop
+  // preheader is not terminated by a sync.
+  BasicBlock *Preheader = L->getLoopPreheader();
+  if (!Preheader) {
+    DEBUG(dbgs() << "LS: Loop lacks a preheader.\n");
+    ORE.emit(OptimizationRemarkMissed(LS_NAME, "NoPreheader",
+                                      L->getStartLoc(), L->getHeader())
+             << "loop lacks a preheader");
+    emitMissedWarning(F, L, Hints, &ORE);
+    return false;
+  } else if (!isa<BranchInst>(Preheader->getTerminator())) {
+    DEBUG(dbgs() << "LS: Loop preheader is not terminated by a branch.\n");
+    ORE.emit(OptimizationRemarkMissed(LS_NAME, "ComplexPreheader",
+                                      L->getStartLoc(), L->getHeader())
+             << "loop preheader not terminated by a branch");
+    emitMissedWarning(F, L, Hints, &ORE);
+    return false;
+  }
+
+  switch (Hints.getStrategy()) {
+  case LoopSpawningHints::ST_SEQ:
+    DEBUG(dbgs() << "LS: Hints dictate sequential spawning.\n");
+    break;
+  case LoopSpawningHints::ST_DAC:
+    DEBUG(dbgs() << "LS: Hints dictate DAC spawning.\n");
+    {
+      DebugLoc DLoc = L->getStartLoc();
+      BasicBlock *Header = L->getHeader();
+      DACLoopSpawning DLS(L, SE, &LI, &DT, &AC, ORE);
+      // CilkABILoopSpawning DLS(L, SE, &LI, &DT, &AC, ORE);
+      // DACLoopSpawning DLS(L, SE, LI, DT, TLI, TTI, ORE);
+      if (DLS.processLoop()) {
+        DEBUG({
+            if (verifyFunction(*L->getHeader()->getParent())) {
+              dbgs() << "Transformed function is invalid.\n";
+              return false;
+            }
+          });
+        // Report success.
+        ORE.emit(OptimizationRemark(LS_NAME, "DACSpawning", DLoc, Header)
+                 << "spawning iterations using divide-and-conquer");
+        return true;
+      } else {
+        // Report failure.
+        ORE.emit(OptimizationRemarkMissed(LS_NAME, "NoDACSpawning", DLoc,
+                                          Header)
+                 << "cannot spawn iterations using divide-and-conquer");
+        emitMissedWarning(F, L, Hints, &ORE);
+        return false;
+      }
+    }
+    break;
+  case LoopSpawningHints::ST_END:
+    dbgs() << "LS: Hints specify unknown spawning strategy.\n";
+    break;
+  }
+  return false;
+}
+
+// PreservedAnalyses LoopSpawningPass::run(Module &M, ModuleAnalysisManager &AM) {
+//   // Find functions that detach for processing.
+//   SmallVector<Function *, 4> WorkList;
+//   for (Function &F : M)
+//     for (BasicBlock &BB : F)
+//       if (isa<DetachInst>(BB.getTerminator()))
+//         WorkList.push_back(&F);
+
+//   if (WorkList.empty())
+//     return PreservedAnalyses::all();
+
+//   bool Changed = false;
+//   while (!WorkList.empty()) {
+//     Function *F = WorkList.back();
+//     auto &TLI = AM.getResult<TargetLibraryAnalysis>(M);
+//     auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+//     auto &LI = FAM.getResult<LoopAnalysis>(*F);
+//     auto &SE = FAM.getResult<ScalarEvolutionAnalysis>(*F);
+//     auto &DT = FAM.getResult<DominatorTreeAnalysis>(*F);
+//     auto &TTI = FAM.getResult<TargetIRAnalysis>(*F);
+//     auto &AA = FAM.getResult<AAManager>(*F);
+//     auto &AC = FAM.getResult<AssumptionAnalysis>(*F);
+//     auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
+//     LoopSpawningImpl Impl(*F, LI, SE, DT, TTI, &TLI, AA, AC, ORE);
+//     Changed |= Impl.run();
+//     WorkList.pop_back();
+//   }
+
+//   if (Changed)
+//     return PreservedAnalyses::none();
+//   return PreservedAnalyses::all();
+// }
+
+PreservedAnalyses LoopSpawningPass::run(Function &F,
+                                        FunctionAnalysisManager &AM) {
+  // Determine if function detaches.
+  bool DetachingFunction = false;
+  for (BasicBlock &BB : F)
+    if (isa<DetachInst>(BB.getTerminator()))
+      DetachingFunction = true;
+
+  if (!DetachingFunction)
+    return PreservedAnalyses::all();
+
+  auto &LI = AM.getResult<LoopAnalysis>(F);
+  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+  // auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+  // auto &TLI = AM.getResult<TargetLibraryAnalysis>(M);
+  // auto &AA = AM.getResult<AAManager>(F);
+  auto &AC = AM.getResult<AssumptionAnalysis>(F);
+  auto &ORE =
+    AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+  // OptimizationRemarkEmitter ORE(F);
+
+  bool Changed = LoopSpawningImpl(F, LI, SE, DT, AC, ORE).run();
+
+  AM.invalidate(F);
+
+  if (Changed)
+    return PreservedAnalyses::none();
+  return PreservedAnalyses::all();
+}
+
+namespace {
+struct LoopSpawning : public FunctionPass {
+  /// Pass identification, replacement for typeid
+  static char ID;
+
+  explicit LoopSpawning() : FunctionPass(ID) {
+    initializeLoopSpawningPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override {
+    if (skipFunction(F))
+      return false;
+
+    bool DetachingFunction = false;
+    for (BasicBlock &BB : F)
+      if (isa<DetachInst>(BB.getTerminator()))
+        DetachingFunction = true;
+
+    if (!DetachingFunction)
+      return false;
+
+    auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+    auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+    auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+    // auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*F);
+    // auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+    // auto *TLI = TLIP ? &TLIP->getTLI() : nullptr;
+    // auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+    // auto *AA = &getAnalysis<AAResultsWrapperPass>(*F).getAAResults();
+    auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+    auto &ORE =
+      getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+    // OptimizationRemarkEmitter ORE(F);
+    return LoopSpawningImpl(F, LI, SE, DT, AC, ORE).run();
+  }
+
+  // bool runOnModule(Module &M) override {
+  //   if (skipModule(M))
+  //     return false;
+
+  //   // Find functions that detach for processing.
+  //   SmallVector<Function *, 4> WorkList;
+  //   for (Function &F : M)
+  //     for (BasicBlock &BB : F)
+  //       if (isa<DetachInst>(BB.getTerminator()))
+  //         WorkList.push_back(&F);
+
+  //   if (WorkList.empty())
+  //     return false;
+
+  //   auto GetLI = [this](Function &F) -> LoopInfo & {
+  //     return getAnalysis<LoopInfoWrapperPass>(F).getLoopInfo();
+  //   };
+  //   auto GetSE = [this](Function &F) -> ScalarEvolution & {
+  //     return getAnalysis<ScalarEvolutionWrapperPass>(F).getSE();
+  //   };
+  //   auto GetDT = [this](Function &F) -> DominatorTree & {
+  //     return this->getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
+  //   };
+
+  //   bool Changed = false;
+  //   while (!WorkList.empty()) {
+  //     // Process the next function.
+  //     Function *F = WorkList.back();
+  //     // auto *LI = &getAnalysis<LoopInfoWrapperPass>(*F).getLoopInfo();
+  //     // auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>(*F).getSE();
+  //     // auto *DT = &getAnalysis<DominatorTreeWrapperPass>(*F).getDomTree();
+  //     // auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*F);
+  //     // auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+  //     // auto *TLI = TLIP ?
+  //       &TLIP->getTLI() : nullptr;
+  //     // auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  //     // auto *AA = &getAnalysis<AAResultsWrapperPass>(*F).getAAResults();
+  //     // auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(*F);
+  //     auto &ORE =
+  //       getAnalysis<OptimizationRemarkEmitterWrapperPass>(*F).getORE();
+  //     // OptimizationRemarkEmitter ORE(F);
+  //     // LoopSpawningImpl Impl(*F, GetLI, GetSE, GetDT, *TTI, TLI, *AA, *AC, ORE);
+  //     LoopSpawningImpl Impl(*F, GetLI, GetSE, GetDT, ORE);
+  //     Changed |= Impl.run();
+
+  //     WorkList.pop_back();
+  //   }
+  //   return Changed;
+  // }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequiredID(LoopSimplifyID);
+    AU.addRequiredID(LCSSAID);
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<ScalarEvolutionWrapperPass>();
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+    // AU.addRequired<LoopAccessLegacyAnalysis>();
+    // getAAResultsAnalysisUsage(AU);
+    // AU.addRequired<AAResultsWrapperPass>();
+    AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+  }
+};
+}
+
+char LoopSpawning::ID = 0;
+// static RegisterPass<LoopSpawning> X(LS_NAME, "Transform Tapir loops to spawn iterations efficiently", false, false);
+static const char ls_name[] = "Loop Spawning";
+INITIALIZE_PASS_BEGIN(LoopSpawning, LS_NAME, ls_name, false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+// INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
+// INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_END(LoopSpawning, LS_NAME, ls_name, false, false)
+
+namespace llvm {
+Pass *createLoopSpawningPass() {
+  return new LoopSpawning();
+}
+}
diff --git a/llvm/lib/Transforms/Tapir/LowerToCilk.cpp b/llvm/lib/Transforms/Tapir/LowerToCilk.cpp
new file mode 100644
index
00000000000000..2d8b1ccb82572e
--- /dev/null
+++ b/llvm/lib/Transforms/Tapir/LowerToCilk.cpp
@@ -0,0 +1,219 @@
+//===- LowerToCilk.cpp - Convert Tapir into Cilk runtime calls ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass converts functions that include Tapir instructions to call out to
+// the Cilk runtime system.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Tapir/CilkABI.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Tapir.h"
+
+#define DEBUG_TYPE "tapir2cilk"
+
+using namespace llvm;
+
+static cl::opt<bool> ClInstrumentCilk("instrument-cilk", cl::init(false),
+                                      cl::Hidden,
+                                      cl::desc("Instrument Cilk events"));
+
+cl::opt<bool> fastCilk("fast-cilk", cl::init(false), cl::Hidden,
+                       cl::desc("Attempt faster cilk call implementation"));
+
+namespace {
+
+struct LowerTapirToCilk : public ModulePass {
+  static char ID; // Pass identification, replacement for typeid
+  bool DisablePostOpts;
+  bool Instrument;
+  explicit LowerTapirToCilk(bool DisablePostOpts = false, bool Instrument = false)
+      : ModulePass(ID), DisablePostOpts(DisablePostOpts),
+        Instrument(Instrument) {
+    initializeLowerTapirToCilkPass(*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override {
+    return "Simple Lowering of Tapir to Cilk ABI";
+  }
+
+  bool runOnModule(Module &M) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+  }
+private:
+  ValueToValueMapTy DetachCtxToStackFrame;
+  bool unifyReturns(Function &F);
+  SmallVectorImpl<Function *> *processFunction(Function &F, DominatorTree &DT,
+                                               AssumptionCache &AC);
+};
+}  // End of
anonymous namespace
+
+char LowerTapirToCilk::ID = 0;
+INITIALIZE_PASS_BEGIN(LowerTapirToCilk, "tapir2cilk",
+                      "Simple Lowering of Tapir to Cilk ABI", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(LowerTapirToCilk, "tapir2cilk",
+                    "Simple Lowering of Tapir to Cilk ABI", false, false)
+
+// Helper function to inline calls to compiler-generated Cilk Plus runtime
+// functions when possible.  This inlining is necessary to properly implement
+// some Cilk runtime "calls," such as __cilkrts_detach().
+static inline void inlineCilkFunctions(Function &F) {
+  bool inlining = true;
+  while (inlining) {
+    inlining = false;
+    for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I)
+      if (CallInst *cal = dyn_cast<CallInst>(&*I))
+        if (Function *fn = cal->getCalledFunction())
+          if (fn->getName().startswith("__cilk")) {
+            InlineFunctionInfo IFI;
+            if (InlineFunction(cal, IFI)) {
+              if (fn->getNumUses() == 0)
+                fn->eraseFromParent();
+              inlining = true;
+              break;
+            }
+          }
+  }
+
+  if (verifyFunction(F, &errs())) {
+    DEBUG(F.dump());
+    assert(0);
+  }
+}
+
+bool LowerTapirToCilk::unifyReturns(Function &F) {
+  SmallVector<BasicBlock *, 4> ReturningBlocks;
+  for (BasicBlock &BB : F)
+    if (isa<ReturnInst>(BB.getTerminator()))
+      ReturningBlocks.push_back(&BB);
+
+  // If this function already has a single return, then terminate early.
+  if (ReturningBlocks.size() == 1)
+    return false;
+
+  BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(),
+                                               "UnifiedReturnBlock", &F);
+  PHINode *PN = nullptr;
+  if (F.getReturnType()->isVoidTy()) {
+    ReturnInst::Create(F.getContext(), nullptr, NewRetBlock);
+  } else {
+    // If the function doesn't return void... add a PHI node to the block...
+    PN = PHINode::Create(F.getReturnType(), ReturningBlocks.size(),
+                         "UnifiedRetVal");
+    NewRetBlock->getInstList().push_back(PN);
+    ReturnInst::Create(F.getContext(), PN, NewRetBlock);
+  }
+
+  // Loop over all of the blocks, replacing the return instruction with an
+  // unconditional branch.
+  //
+  for (BasicBlock *BB : ReturningBlocks) {
+    // Add an incoming element to the PHI node for every return instruction that
+    // is merging into this new block...
+    if (PN)
+      PN->addIncoming(BB->getTerminator()->getOperand(0), BB);
+
+    BB->getInstList().pop_back();  // Remove the return insn
+    BranchInst::Create(NewRetBlock, BB);
+  }
+  return true;
+}
+
+SmallVectorImpl<Function *>
+*LowerTapirToCilk::processFunction(Function &F, DominatorTree &DT,
+                                   AssumptionCache &AC) {
+  if (fastCilk && F.getName() == "main") {
+    IRBuilder<> start(F.getEntryBlock().getFirstNonPHIOrDbg());
+    auto m = start.CreateCall(CILKRTS_FUNC(init, *F.getParent()));
+    m->moveBefore(F.getEntryBlock().getTerminator());
+  }
+
+  if (unifyReturns(F))
+    DT.recalculate(F);
+
+  // Lower Tapir instructions in this function.  Collect the set of helper
+  // functions generated by this process.
+  SmallVector<Function *, 4> *NewHelpers = new SmallVector<Function *, 4>();
+  for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
+    if (DetachInst *DI = dyn_cast_or_null<DetachInst>(I->getTerminator())) {
+      // Lower a detach instruction, and collect the helper function generated
+      // in this process for executing the detached task.
+      Function *Helper = cilk::createDetach(*DI, DetachCtxToStackFrame, DT, AC,
+                                            ClInstrumentCilk || Instrument);
+      NewHelpers->push_back(Helper);
+    } else if (SyncInst *SI = dyn_cast_or_null<SyncInst>(I->getTerminator())) {
+      // Lower a sync instruction.
+      cilk::createSync(*SI, DetachCtxToStackFrame,
+                       ClInstrumentCilk || Instrument);
+    }
+  }
+
+  if (verifyFunction(F, &errs())) {
+    DEBUG(F.dump());
+    assert(0);
+  }
+
+  // Inline Cilk runtime calls in the function and generated helper functions.
+  inlineCilkFunctions(F);
+  for (Function *H : *NewHelpers)
+    inlineCilkFunctions(*H);
+
+  return NewHelpers;
+}
+
+bool LowerTapirToCilk::runOnModule(Module &M) {
+  if (skipModule(M))
+    return false;
+
+  // Add functions that detach to the work list.
+  SmallVector<Function *, 4> WorkList;
+  for (Function &F : M)
+    for (BasicBlock &BB : F)
+      if (isa<DetachInst>(BB.getTerminator())) {
+        WorkList.push_back(&F);
+        break;
+      }
+
+  if (WorkList.empty())
+    return false;
+
+  bool Changed = false;
+  std::unique_ptr<SmallVectorImpl<Function *>> NewHelpers;
+  while (!WorkList.empty()) {
+    // Process the next function.
+    Function *F = WorkList.back();
+    WorkList.pop_back();
+    DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(*F).getDomTree();
+    AssumptionCacheTracker &ACT = getAnalysis<AssumptionCacheTracker>();
+    NewHelpers.reset(processFunction(*F, DT, ACT.getAssumptionCache(*F)));
+    Changed |= !NewHelpers->empty();
+    // Check the generated helper functions to see if any need to be processed,
+    // that is, to see if any of them themselves detach a subtask.
+    for (Function *Helper : *NewHelpers)
+      for (BasicBlock &BB : *Helper)
+        if (isa<DetachInst>(BB.getTerminator()))
+          WorkList.push_back(Helper);
+  }
+  return Changed;
+}
+
+// createLowerTapirToCilkPass - Provide an entry point to create this pass.
+//
+namespace llvm {
+ModulePass *createLowerTapirToCilkPass(bool DisablePostOpts, bool Instrument) {
+  return new LowerTapirToCilk(DisablePostOpts, Instrument);
+}
+}
diff --git a/llvm/lib/Transforms/Tapir/Outline.cpp b/llvm/lib/Transforms/Tapir/Outline.cpp
new file mode 100644
index 00000000000000..ce347c4bf7fdf6
--- /dev/null
+++ b/llvm/lib/Transforms/Tapir/Outline.cpp
@@ -0,0 +1,379 @@
+//===- TapirOutline.cpp - Outlining for Tapir -----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements helper functions for outlining portions of code
+// containing Tapir instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Tapir/Outline.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "outlining"
+
+/// definedInRegion - Return true if the specified value is defined in the
+/// extracted region.
+static bool definedInRegion(const SmallPtrSetImpl<BasicBlock *> &Blocks,
+                            Value *V) {
+  if (Instruction *I = dyn_cast<Instruction>(V))
+    if (Blocks.count(I->getParent()))
+      return true;
+  return false;
+}
+
+/// definedInCaller - Return true if the specified value is defined in the
+/// function being code extracted, but not in the region being extracted.
+/// These values must be passed in as live-ins to the function.
+static bool definedInCaller(const SmallPtrSetImpl<BasicBlock *> &Blocks,
+                            Value *V) {
+  if (isa<Argument>(V)) return true;
+  if (Instruction *I = dyn_cast<Instruction>(V))
+    if (!Blocks.count(I->getParent()))
+      return true;
+  return false;
+}
+
+void llvm::findInputsOutputs(const SmallPtrSetImpl<BasicBlock *> &Blocks,
+                             ValueSet &Inputs,
+                             ValueSet &Outputs,
+                             const SmallPtrSetImpl<BasicBlock *> *ExitBlocks) {
+  for (BasicBlock *BB : Blocks) {
+    // If a used value is defined outside the region, it's an input.  If an
+    // instruction is used outside the region, it's an output.
+    for (Instruction &II : *BB) {
+      for (User::op_iterator OI = II.op_begin(), OE = II.op_end(); OI != OE;
+           ++OI) {
+        // The PHI nodes in each exit block will be updated after the exit block
+        // is cloned.  Hence, we don't want to count their uses of values
+        // defined outside the region.
+        if (ExitBlocks->count(BB))
+          if (PHINode *PN = dyn_cast<PHINode>(&II))
+            if (!Blocks.count(PN->getIncomingBlock(*OI)))
+              continue;
+        if (definedInCaller(Blocks, *OI))
+          Inputs.insert(*OI);
+      }
+
+      for (User *U : II.users())
+        if (!definedInRegion(Blocks, U)) {
+          Outputs.insert(&II);
+          break;
+        }
+    }
+  }
+}
+
+// Clone Blocks into NewFunc, transforming the old arguments into references to
+// VMap values.
+//
+/// TODO: Fix the std::vector part of the type of this function.
+void llvm::CloneIntoFunction(Function *NewFunc, const Function *OldFunc,
+                             std::vector<BasicBlock *> Blocks,
+                             ValueToValueMapTy &VMap,
+                             bool ModuleLevelChanges,
+                             SmallVectorImpl<ReturnInst *> &Returns,
+                             const StringRef NameSuffix,
+                             SmallPtrSetImpl<BasicBlock *> *ExitBlocks,
+                             DISubprogram *SP,
+                             ClonedCodeInfo *CodeInfo,
+                             ValueMapTypeRemapper *TypeMapper,
+                             ValueMaterializer *Materializer) {
+  // Get the predecessors of the exit blocks
+  SmallPtrSet<const BasicBlock *, 4> ExitBlockPreds, ClonedEBPreds;
+  for (BasicBlock *EB : *ExitBlocks)
+    for (BasicBlock *Pred : predecessors(EB))
+      ExitBlockPreds.insert(Pred);
+
+  // When we remap instructions, we want to avoid duplicating inlined
+  // DISubprograms, so record all subprograms we find as we duplicate
+  // instructions and then freeze them in the MD map.
+  DebugInfoFinder DIFinder;
+
+  // Loop over all of the basic blocks in the function, cloning them as
+  // appropriate.
+  for (const BasicBlock *BB : Blocks) {
+    // Record all exit block predecessors that are cloned.
+    if (ExitBlockPreds.count(BB))
+      ClonedEBPreds.insert(BB);
+
+    // Create a new basic block and copy instructions into it!
+    BasicBlock *CBB = CloneBasicBlock(BB, VMap, NameSuffix, NewFunc, CodeInfo,
+                                      SP ? &DIFinder : nullptr);
+
+    // Add basic block mapping.
+    VMap[BB] = CBB;
+
+    // It is only legal to clone a function if a block address within that
+    // function is never referenced outside of the function.  Given that, we
+    // want to map block addresses from the old function to block addresses in
+    // the clone.
+    // (This is different from the generic ValueMapper
+    // implementation, which generates an invalid blockaddress when
+    // cloning a function.)
+    if (BB->hasAddressTaken()) {
+      Constant *OldBBAddr = BlockAddress::get(const_cast<Function *>(OldFunc),
+                                              const_cast<BasicBlock *>(BB));
+      VMap[OldBBAddr] = BlockAddress::get(NewFunc, CBB);
+    }
+
+    // Note return instructions for the caller.
+    if (ReturnInst *RI = dyn_cast<ReturnInst>(CBB->getTerminator()))
+      Returns.push_back(RI);
+  }
+
+  // For each exit block, clean up its phi nodes to exclude predecessors that
+  // were not cloned.
+  if (ExitBlocks) {
+    for (BasicBlock *EB : *ExitBlocks) {
+      // Get the predecessors of this exit block that were not cloned.
+      SmallVector<BasicBlock *, 4> PredNotCloned;
+      for (BasicBlock *Pred : predecessors(EB))
+        if (!ClonedEBPreds.count(Pred))
+          PredNotCloned.push_back(Pred);
+
+      // Iterate over the phi nodes in the cloned exit block and remove incoming
+      // values from predecessors that were not cloned.
+      BasicBlock *ClonedEB = cast<BasicBlock>(VMap[EB]);
+      BasicBlock::iterator BI = ClonedEB->begin();
+      while (PHINode *PN = dyn_cast<PHINode>(BI)) {
+        for (BasicBlock *DeadPred : PredNotCloned)
+          if (PN->getBasicBlockIndex(DeadPred) > -1)
+            PN->removeIncomingValue(DeadPred);
+        ++BI;
+      }
+    }
+  }
+
+  // for (DISubprogram *ISP : DIFinder.subprograms()) {
+  //   if (ISP != SP) {
+  //     VMap.MD()[ISP].reset(ISP);
+  //   }
+  // }
+
+  // Loop over all of the instructions in the function, fixing up operand
+  // references as we go.  This uses VMap to do all the hard work.
+  for (const BasicBlock *BB : Blocks) {
+    BasicBlock *CBB = cast<BasicBlock>(VMap[BB]);
+    // Loop over all instructions, fixing each one as we find it...
+    for (Instruction &II : *CBB)
+      RemapInstruction(&II, VMap,
+                       ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
+                       TypeMapper, Materializer);
+  }
+}
+
+/// Create a helper function whose signature is based on Inputs and
+/// Outputs as follows: f(in0, ..., inN, out0, ..., outN)
+///
+/// TODO: Fix the std::vector part of the type of this function.
+Function *llvm::CreateHelper(const ValueSet &Inputs,
+                             const ValueSet &Outputs,
+                             std::vector<BasicBlock *> Blocks,
+                             BasicBlock *Header,
+                             const BasicBlock *OldEntry,
+                             const BasicBlock *OldExit,
+                             ValueToValueMapTy &VMap,
+                             Module *DestM,
+                             bool ModuleLevelChanges,
+                             SmallVectorImpl<ReturnInst *> &Returns,
+                             const StringRef NameSuffix,
+                             SmallPtrSetImpl<BasicBlock *> *ExitBlocks,
+                             const Instruction *InputSyncRegion,
+                             ClonedCodeInfo *CodeInfo,
+                             ValueMapTypeRemapper *TypeMapper,
+                             ValueMaterializer *Materializer) {
+  DEBUG(dbgs() << "inputs: " << Inputs.size() << "\n");
+  DEBUG(dbgs() << "outputs: " << Outputs.size() << "\n");
+
+  Function *OldFunc = Header->getParent();
+  Type *RetTy = Type::getVoidTy(Header->getContext());
+
+  std::vector<Type *> paramTy;
+
+  // Add the types of the input values to the function's argument list
+  for (Value *value : Inputs) {
+    DEBUG(dbgs() << "value used in func: " << *value << "\n");
+    paramTy.push_back(value->getType());
+  }
+
+  // Add the types of the output values to the function's argument list.
+  for (Value *output : Outputs) {
+    DEBUG(dbgs() << "instr used in func: " << *output << "\n");
+    paramTy.push_back(PointerType::getUnqual(output->getType()));
+  }
+
+  DEBUG({
+      dbgs() << "Function type: " << *RetTy << " f(";
+      for (Type *i : paramTy)
+        dbgs() << *i << ", ";
+      dbgs() << ")\n";
+    });
+
+  FunctionType *FTy = FunctionType::get(RetTy, paramTy, false);
+
+  // Create the new function
+  Function *NewFunc = Function::Create(FTy,
+                                       GlobalValue::InternalLinkage,
+                                       OldFunc->getName() + "_" +
+                                       Header->getName() + NameSuffix, DestM);
+
+  // Set names for input and output arguments.
+  Function::arg_iterator DestI = NewFunc->arg_begin();
+  for (Value *I : Inputs)
+    if (VMap.count(I) == 0) {       // Is this argument preserved?
+      DestI->setName(I->getName()+NameSuffix);  // Copy the name over...
+      VMap[I] = &*DestI++;          // Add mapping to VMap
+    }
+  for (Value *I : Outputs)
+    if (VMap.count(I) == 0) {       // Is this argument preserved?
+      DestI->setName(I->getName()+NameSuffix);  // Copy the name over...
+      VMap[I] = &*DestI++;          // Add mapping to VMap
+    }
+
+  // Copy all attributes other than those stored in the AttributeSet.  We need
+  // to remap the parameter indices of the AttributeSet.
+  AttributeList NewAttrs = NewFunc->getAttributes();
+  NewFunc->copyAttributesFrom(OldFunc);
+  NewFunc->setAttributes(NewAttrs);
+
+  // Fix up the personality function that got copied over.
+  if (OldFunc->hasPersonalityFn())
+    NewFunc->setPersonalityFn(
+        MapValue(OldFunc->getPersonalityFn(), VMap,
+                 ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
+                 TypeMapper, Materializer));
+
+  SmallVector<AttributeSet, 4> NewArgAttrs(NewFunc->arg_size());
+  AttributeList OldAttrs = OldFunc->getAttributes();
+
+  // Clone any argument attributes
+  for (Argument &OldArg : OldFunc->args()) {
+    // Check if we're passing this argument to the helper.  We check Inputs here
+    // instead of the VMap to avoid potentially populating the VMap with a null
+    // entry for the old argument.
+    if (Inputs.count(&OldArg) || Outputs.count(&OldArg)) {
+      Argument *NewArg = dyn_cast<Argument>(VMap[&OldArg]);
+      NewArgAttrs[NewArg->getArgNo()] =
+        OldAttrs.getParamAttributes(OldArg.getArgNo());
+    }
+  }
+
+  // Ignore the return attributes of the old function.
+  NewFunc->setAttributes(
+      AttributeList::get(NewFunc->getContext(), OldAttrs.getFnAttributes(),
+                         AttributeSet(), NewArgAttrs));
+
+  // Clone the metadata from the old function into the new.
+  bool MustCloneSP =
+      OldFunc->getParent() && OldFunc->getParent() == NewFunc->getParent();
+  DISubprogram *SP = OldFunc->getSubprogram();
+  if (SP) {
+    assert(!MustCloneSP || ModuleLevelChanges);
+    // Add mappings for some DebugInfo nodes that we don't want duplicated
+    // even if they're distinct.
+    auto &MD = VMap.MD();
+    MD[SP->getUnit()].reset(SP->getUnit());
+    MD[SP->getType()].reset(SP->getType());
+    MD[SP->getFile()].reset(SP->getFile());
+    // If we're not cloning into the same module, no need to clone the
+    // subprogram
+    if (!MustCloneSP)
+      MD[SP].reset(SP);
+  }
+
+  SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
+  OldFunc->getAllMetadata(MDs);
+  for (auto MD : MDs) {
+    NewFunc->addMetadata(
+        MD.first,
+        *MapMetadata(MD.second, VMap,
+                     ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
+                     TypeMapper, Materializer));
+  }
+
+  // We assume that the Helper reads and writes its arguments.  If the parent
+  // function had stronger attributes on memory access -- specifically, if the
+  // parent is marked as only reading memory -- we must replace this attribute
+  // with an appropriate weaker form.
+  if (OldFunc->onlyReadsMemory()) {
+    NewFunc->removeFnAttr(Attribute::ReadNone);
+    NewFunc->removeFnAttr(Attribute::ReadOnly);
+    NewFunc->setOnlyAccessesArgMemory();
+  }
+
+  // Inherit the calling convention from the parent.
+  NewFunc->setCallingConv(OldFunc->getCallingConv());
+
+  // The new function needs a root node because other nodes can branch to the
+  // head of the region, but the entry node of a function cannot have preds.
+  BasicBlock *NewEntry = BasicBlock::Create(Header->getContext(),
+                                            OldEntry->getName()+NameSuffix,
+                                            NewFunc);
+  // The new function also needs an exit node.
+  BasicBlock *NewExit = BasicBlock::Create(Header->getContext(),
+                                           OldExit->getName()+NameSuffix,
+                                           NewFunc);
+
+  // Add mappings to the NewEntry and NewExit.
+  VMap[OldEntry] = NewEntry;
+  VMap[OldExit] = NewExit;
+
+  // Create new sync region to replace the old one containing any cloned Tapir
+  // instructions, and add the appropriate mappings.
+  if (InputSyncRegion) {
+    Instruction *NewSR = InputSyncRegion->clone();
+    if (InputSyncRegion->hasName())
+      NewSR->setName(InputSyncRegion->getName()+NameSuffix);
+    NewEntry->getInstList().push_back(NewSR);
+    VMap[InputSyncRegion] = NewSR;
+  }
+
+  // Clone Blocks into the new function.
+  CloneIntoFunction(NewFunc, OldFunc, Blocks, VMap, ModuleLevelChanges,
+                    Returns, NameSuffix, ExitBlocks, SP, CodeInfo,
+                    TypeMapper, Materializer);
+
+  // Add a branch in the new function to the cloned Header.
+  BranchInst::Create(cast<BasicBlock>(VMap[Header]), NewEntry);
+  // Add a return in the new function.
+  ReturnInst::Create(Header->getContext(), NewExit);
+
+  return NewFunc;
+}
+
+// Add alignment assumptions to parameters of outlined function, based on known
+// alignment data in the caller.
+void llvm::AddAlignmentAssumptions(const Function *Caller,
+                                   const ValueSet &Inputs,
+                                   ValueToValueMapTy &VMap,
+                                   const Instruction *CallSite,
+                                   AssumptionCache *AC,
+                                   DominatorTree *DT) {
+  auto &DL = Caller->getParent()->getDataLayout();
+  for (Value *ArgVal : Inputs) {
+    // Ignore arguments to non-pointer types
+    if (!ArgVal->getType()->isPointerTy()) continue;
+    Argument *Arg = cast<Argument>(VMap[ArgVal]);
+    // Ignore arguments to non-pointer types
+    if (!Arg->getType()->isPointerTy()) continue;
+    // If the argument already has an alignment attribute, skip it.
+    if (Arg->getParamAlignment()) continue;
+    // Get any known alignment information for this argument's value.
+    unsigned Align = getKnownAlignment(ArgVal, DL, CallSite, AC, DT);
+    // If we have alignment data, add it as an attribute to the outlined
+    // function's parameter.
+    if (Align)
+      Arg->addAttr(Attribute::getWithAlignment(Arg->getContext(), Align));
+  }
+}
diff --git a/llvm/lib/Transforms/Tapir/RedundantSpawn.cpp b/llvm/lib/Transforms/Tapir/RedundantSpawn.cpp
new file mode 100644
index 00000000000000..8b9242b1424e4a
--- /dev/null
+++ b/llvm/lib/Transforms/Tapir/RedundantSpawn.cpp
@@ -0,0 +1,87 @@
+
+#include "llvm/Transforms/Tapir.h"
+
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/IR/CFG.h"
+
+using namespace llvm;
+
+namespace {
+struct RedundantSpawn : public FunctionPass {
+  static char ID; // Pass identification, replacement for typeid
+  RedundantSpawn() : FunctionPass(ID) {
+    //initializeRedundantSpawnPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    //AU.addRequired();
+    //AU.addPreserved();
+  }
+
+  bool runOnFunction(Function &F) override {
+    if (skipFunction(F))
+      return false;
+
+    F.setName("RedundantSpawn_"+F.getName());
+
+    bool effective = false;
+    do {
+      effective = false;
+      TerminatorInst* prior = nullptr;
+      BasicBlock* start = nullptr;
+      bool lookForDetach = false;
+      int rank = 0;
+      for (BasicBlock &BB: F) {
+        if (isa<ReattachInst>(BB.getTerminator()) && BB.size() == 1) {
+          lookForDetach = true;
+          start = &BB;
+          effective = true;
+          break;
+        }
+        if (prior != nullptr && isa<DetachInst>(prior))
+          rank += 1;
+        if (prior != nullptr && isa<ReattachInst>(prior))
+          rank -= 1;
+        prior = BB.getTerminator();
+      }
+      if (lookForDetach) {
+        BasicBlock* current = start;
+        int currentRank = rank;
+        while (true) {
+          for (BasicBlock *Pred : predecessors(current)) {
+            current = Pred;
+            break;
+          }
+          if (isa<DetachInst>(current->getTerminator()) && currentRank == rank) {
+            BranchInst* replaceReattach = BranchInst::Create(start->getSingleSuccessor());
+            BranchInst* replaceDetach =
+                BranchInst::Create(current->getTerminator()->getSuccessor(0));
+            ReplaceInstWithInst(start->getTerminator(), replaceReattach);
+            ReplaceInstWithInst(current->getTerminator(), replaceDetach);
+            break;
+          }
+          if (isa<ReattachInst>(current->getTerminator()))
+            currentRank -= 1;
+          if (isa<DetachInst>(current->getTerminator()))
+            currentRank += 1;
+        }
+      }
+    } while (effective);
+
+    return true;
+  }
+};
+}
+
+char RedundantSpawn::ID = 0;
+static RegisterPass<RedundantSpawn> X("redundantspawn", "Do RedundantSpawn pass", false, false);
+
+// Public interface to the RedundantSpawn pass
+FunctionPass *llvm::createRedundantSpawnPass() {
+  return new RedundantSpawn();
+}
diff --git a/llvm/lib/Transforms/Tapir/SmallBlock.cpp b/llvm/lib/Transforms/Tapir/SmallBlock.cpp
new file mode 100644
index 00000000000000..c46e90baeb620a
--- /dev/null
+++ b/llvm/lib/Transforms/Tapir/SmallBlock.cpp
@@ -0,0 +1,68 @@
+
+#include "llvm/Transforms/Tapir.h"
+
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+namespace {
+struct SmallBlock : public FunctionPass {
+  static const int threshold = 10;
+  static char ID; // Pass identification, replacement for typeid
+  SmallBlock() : FunctionPass(ID) {
+    //initializeSmallBlockPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    //AU.addRequired();
+    //AU.addPreserved();
+  }
+
+  bool runOnFunction(Function &F) override {
+    if (skipFunction(F))
+      return false;
+
+    F.setName("SmallBlock_"+F.getName());
+
+    BasicBlock* b = nullptr;
+    BasicBlock* prior = nullptr;
+    bool effective;
+    int count = 0;
+    do {
+      effective = false;
+      for (BasicBlock &BB: F) {
+        count += BB.size();
+        if (isa<DetachInst>(BB.getTerminator())) {
+          b = &BB;
+          count = 0;
+        }
+        if (isa<ReattachInst>(BB.getTerminator()) && count < threshold && prior != b) {
+          // b is ensured to be the corresponding detach
+          effective = true;
+          prior = b;
+          BranchInst* replaceReattach = BranchInst::Create(BB.getSingleSuccessor());
+          BranchInst* replaceDetach = BranchInst::Create(b->getTerminator()->getSuccessor(0));
+          ReplaceInstWithInst(BB.getTerminator(), replaceReattach);
+          ReplaceInstWithInst(b->getTerminator(), replaceDetach);
+        }
+      }
+    } while (effective);
+
+    return true;
+  }
+};
+}
+
+char SmallBlock::ID = 0;
+static RegisterPass<SmallBlock> X("smallblock", "Do SmallBlock pass", false, false);
+
+// Public interface to the SmallBlock pass
+FunctionPass *llvm::createSmallBlockPass() {
+  return new SmallBlock();
+}
diff --git a/llvm/lib/Transforms/Tapir/SpawnRestructure.cpp b/llvm/lib/Transforms/Tapir/SpawnRestructure.cpp
new file mode 100644
index 00000000000000..2b0b15ca1900a6
--- /dev/null
+++ b/llvm/lib/Transforms/Tapir/SpawnRestructure.cpp
@@ -0,0 +1,48 @@
+
+#include "llvm/Transforms/Tapir.h"
+
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/IR/CFG.h"
+
+using namespace llvm;
+
+namespace {
+struct SpawnRestructure : public FunctionPass {
+  static char ID; // Pass identification, replacement for typeid
+  SpawnRestructure() : FunctionPass(ID) {
+    //initializeSpawnRestructurePass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    //AU.addRequired();
+    //AU.addPreserved();
+  }
+
+  bool runOnFunction(Function &F) override {
+    if (skipFunction(F))
+      return false;
+
+    F.setName("SpawnRestructure_"+F.getName());
+
+    for (BasicBlock &BB: F) {
+      // TODO: restructure spawns in BB.
+    }
+
+    return true;
+  }
+};
+}
+
+char SpawnRestructure::ID = 0;
+static RegisterPass<SpawnRestructure> X("spawnrestructure", "Do SpawnRestructure pass", false, false);
+
+// Public interface to the SpawnRestructure pass
+FunctionPass *llvm::createSpawnRestructurePass() {
+  return new SpawnRestructure();
+}
diff --git a/llvm/lib/Transforms/Tapir/SpawnUnswitch.cpp b/llvm/lib/Transforms/Tapir/SpawnUnswitch.cpp
new file mode 100644
index 00000000000000..9206c90b987393
--- /dev/null
+++ b/llvm/lib/Transforms/Tapir/SpawnUnswitch.cpp
@@ -0,0 +1,96 @@
+
+#include "llvm/Transforms/Tapir.h"
+
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/IR/CFG.h"
+
+using namespace llvm;
+
+namespace {
+struct SpawnUnswitch : public FunctionPass {
+  static char ID; // Pass identification, replacement for typeid
+  SpawnUnswitch() : FunctionPass(ID) {
+    //initializeSpawnUnswitchPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    //AU.addRequired();
+    //AU.addPreserved();
+  }
+
+  bool runOnFunction(Function &F) override {
+    if (skipFunction(F))
+      return false;
+
+    F.setName("SpawnUnswitch_"+F.getName());
+
+    bool effective;
+    do {
+      effective = false;
+      BasicBlock* body = nullptr;
+      BasicBlock* end = nullptr;
+
+      for (BasicBlock &BB: F) {
+        if (BB.size() == 1 && isa<ReattachInst>(BB.getTerminator())) {
+          end = BB.getSingleSuccessor();
+          int count = 0;
+          for (BasicBlock *Pred : predecessors(&BB)) {
+            for (BasicBlock *PredPred : predecessors(Pred)) {
+              if (!isa<DetachInst>(PredPred->getTerminator())) {
+                body = Pred;
+              }
+            }
+            count++;
+          }
+          if (count == 2) { // only predecessors are det.achd and if.then
+            for (BasicBlock *Pred : predecessors(&BB)) {
+              if (Pred->size() == 2 && isa<BranchInst>(Pred->getTerminator())) { // if clause only compares register contents
+                Instruction* cmp = nullptr;
+                for (Instruction &I : *Pred) {
+                  cmp = &I;
+                  break;
+                }
+                for (BasicBlock *PredPred : predecessors(Pred)) {
+                  if (DetachInst *DI = dyn_cast<DetachInst>(PredPred->getTerminator())) { // outer spawn
+                    Value *SyncRegion = DI->getSyncRegion();
+                    effective = true;
+                    // move cmp instruction to outside spawn
+                    Instruction *pi = PredPred->getTerminator();
+                    cmp->moveBefore(pi);
+
+                    // branch now to detach or end
+                    TerminatorInst* temp = Pred->getTerminator();
+                    BranchInst* replaceDetach = BranchInst::Create(Pred, end, cast<BranchInst>(temp)->getCondition());
+                    ReplaceInstWithInst(PredPred->getTerminator(), replaceDetach);
+
+                    // detach now goes straight to body
+                    DetachInst* newDetach = DetachInst::Create(body, end, SyncRegion);
+                    ReplaceInstWithInst(Pred->getTerminator(), newDetach);
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    } while (effective);
+
+    return true;
+  }
+};
+}
+
+char SpawnUnswitch::ID = 0;
+static RegisterPass<SpawnUnswitch> X("spawnunswitch", "Do SpawnUnswitch pass", false, false);
+
+// Public interface to the SpawnUnswitch pass
+FunctionPass *llvm::createSpawnUnswitchPass() {
+  return new SpawnUnswitch();
+}
diff --git a/llvm/lib/Transforms/Tapir/SyncElimination.cpp b/llvm/lib/Transforms/Tapir/SyncElimination.cpp
new file mode 100644
index 00000000000000..62301069348471
--- /dev/null
+++ b/llvm/lib/Transforms/Tapir/SyncElimination.cpp
@@ -0,0 +1,273 @@
+//===- SyncElimination.cpp - Eliminate unnecessary sync calls -------------===//
+
+#include "llvm/Transforms/Tapir.h"
+
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/ADT/SmallSet.h"
+
+#include <deque>
+#include <map>
+
+using namespace llvm;
+
+namespace {
+
+typedef SmallSet<const BasicBlock *, 8> BasicBlockSet;
+typedef std::deque<const BasicBlock *> BasicBlockDeque;
+
+struct SyncElimination : public FunctionPass {
+  static char ID; // Pass identification, replacement for typeid
+
+  SyncElimination() : FunctionPass(ID) {}
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AAResultsWrapperPass>();
+  }
+
+  bool runOnFunction(Function &F) override {
+    if (skipFunction(F))
+      return false;
+
+    errs() << "SyncElimination: Found function: " << F.getName() << "\n";
+
+    bool ChangedAny = false;
+
+    while (true) {
+      bool Changed = false;
+
+      for (BasicBlock &block: F) {
+        if (isa<SyncInst>(block.getTerminator())) {
+          if (processSyncInstBlock(block)) {
+            Changed = true;
+            ChangedAny = true;
+            break;
+          }
+        }
+      }
+
+      if (!Changed) {
+        break;
+      }
+    }
+
+    return ChangedAny;
+  }
+
+private:
+
+  // We will explain what Rosetta and Vegas are later. Or rename them.
+  // We promise.
+
+  // Rosetta-finding code
+
+  void findRosetta(const BasicBlock &BB, BasicBlockSet &OutputSet) {
+    assert(isa<SyncInst>(BB.getTerminator()));
+
+    BasicBlockSet Visited;
+    BasicBlockDeque Frontier;
+    std::map<const BasicBlock *, int> DetachLevel;
+
+    DetachLevel[&BB] = 0;
+    Frontier.push_back(&BB);
+    OutputSet.insert(&BB);
+
+    while (!Frontier.empty()) {
+      const BasicBlock *Current = Frontier.front();
+      Frontier.pop_front();
+
+      for (const BasicBlock *Pred: predecessors(Current)) {
+        // TODO@jiahao: Investigate potential issues with continue edges here.
+
+        if (Visited.count(Pred) > 0) {
+          continue;
+        }
+
+        if (isa<SyncInst>(Pred->getTerminator())) {
+          continue;
+        }
+
+        Visited.insert(Pred);
+
+        DetachLevel[Pred] = DetachLevel[Current];
+
+        if (isa<DetachInst>(Pred->getTerminator())) {
+          DetachLevel[Pred]++;
+        } else if (isa<ReattachInst>(Pred->getTerminator())) {
+          DetachLevel[Pred]--;
+        }
+
+        if (DetachLevel[Pred] > 0) {
+          OutputSet.insert(Pred);
+        }
+
+        if (DetachLevel[Pred] >= 0) {
+          Frontier.push_back(Pred);
+        }
+      }
+    }
+  }
+
+  // Vegas-finding code
+  //
+  // We run BFS starting from the sync block, following all forward edges, and
+  // stop a branch whenever we hit another sync block.
+
+  void findVegas(const BasicBlock &BB, BasicBlockSet &OutputSet) {
+    assert(isa<SyncInst>(BB.getTerminator()));
+
+    BasicBlockSet Visited;
+    BasicBlockDeque Frontier;
+
+    Frontier.push_back(&BB);
+
+    while (!Frontier.empty()) {
+      const BasicBlock *Current = Frontier.front();
+      Frontier.pop_front();
+
+      for (const BasicBlock *Succ: successors(Current)) {
+        if (Visited.count(Succ) > 0) {
+          continue;
+        }
+
+        Visited.insert(Succ);
+        OutputSet.insert(Succ);
+
+        // We need to include blocks whose terminator is another sync.
+        // Therefore we still insert the block into OutputSet in this case.
+        // However we do not search any further past the sync block.
+        if (!isa<SyncInst>(Succ->getTerminator())) {
+          Frontier.push_back(Succ);
+        }
+      }
+    }
+  }
+
+  bool willMod(const ModRefInfo &Info) {
+    return (Info == MRI_Mod || Info == MRI_ModRef);
+  }
+
+  bool instTouchesMemory(const Instruction &Inst) {
+    return Inst.getOpcode() == Instruction::Load ||
+           Inst.getOpcode() == Instruction::Store ||
+           Inst.getOpcode() == Instruction::VAArg ||
+           Inst.getOpcode() == Instruction::AtomicCmpXchg ||
+           Inst.getOpcode() == Instruction::AtomicRMW;
+  }
+
+  // FIXME: we can do better
+  void checkBlowUp(const Instruction &Inst) {
+    if (isa<InvokeInst>(Inst)) {
+      errs() << Inst << "\n";
+      llvm_unreachable("BOOOOOOOOOOOOOOOOOOOOOOOOM! not supported (yet)");
+    }
+  }
+
+  bool isSyncEliminationLegal(const BasicBlockSet &RosettaSet, const BasicBlockSet &VegasSet) {
+    AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+
+    for (const BasicBlock *RBB : RosettaSet) {
+      for (const Instruction &RI : *RBB) {
+        checkBlowUp(RI);
+
+        if (RI.getOpcode() == Instruction::Sync) {
+          continue;
+        }
+
+        for (const BasicBlock *VBB : VegasSet) {
+          for (const Instruction &VI : *VBB) {
+            checkBlowUp(VI);
+
+            if (VI.getOpcode() == Instruction::Sync) {
+              continue;
+            }
+
+            ImmutableCallSite RC(&RI), VC(&VI);
+
+            if (!!RC) {
+              // If RI is a call/invoke
+              if (instTouchesMemory(VI) &&
+                  AA->getModRefInfo(const_cast<Instruction *>(&VI), RC) != MRI_NoModRef) {
+                errs() << "SyncElimination: Conflict found between " << RI << " and " << VI << "\n";
+                return false;
+              }
+            } else if (!!VC) {
+              // If VI is a call/invoke
+              if (instTouchesMemory(RI) &&
+                  AA->getModRefInfo(const_cast<Instruction *>(&RI), VC) != MRI_NoModRef) {
+                errs() << "SyncElimination: Conflict found between " << RI << " and " << VI << "\n";
+                return false;
+              }
+            } else {
+              if (!instTouchesMemory(VI) || !instTouchesMemory(RI)) {
+                continue;
+              }
+
+              // If neither instruction is a call/invoke
+              MemoryLocation VML = MemoryLocation::get(&VI);
+              MemoryLocation RML = MemoryLocation::get(&RI);
+
+              if (AA->alias(RML, VML) && (willMod(AA->getModRefInfo(&RI, RML)) || willMod(AA->getModRefInfo(&VI, VML)))) {
+                // If the two memory locations can potentially alias each
+                // other, and at least one instruction modifies its memory
+                // location.
+                errs() << "SyncElimination: Conflict found between " << RI << " and " << VI << "\n";
+                return false;
+              }
+            }
+          }
+        }
+      }
+    }
+
+    return true;
+  }
+
+  bool processSyncInstBlock(BasicBlock &BB) {
+    errs() << "SyncElimination: Found sync block: " << BB.getName() << "\n";
+
+    BasicBlockSet RosettaSet, VegasSet;
+
+    findRosetta(BB, RosettaSet);
+    findVegas(BB, VegasSet);
+
+    errs() << "SyncElimination: Blocks found in the Rosetta set: " << "\n";
+    for (const BasicBlock *RB: RosettaSet) {
+      errs() << "SyncElimination: " + RB->getName() << "\n";
+    }
+
+    errs() << "SyncElimination: Blocks found in the Vegas set: " << "\n";
+    for (const BasicBlock *VB: VegasSet) {
+      errs() << "SyncElimination: " + VB->getName() << "\n";
+    }
+
+    if (isSyncEliminationLegal(RosettaSet, VegasSet)) {
+      SyncInst *Sync = dyn_cast<SyncInst>(BB.getTerminator());
+      assert(Sync != nullptr);
+      BasicBlock* suc = Sync->getSuccessor(0);
+      IRBuilder<> Builder(Sync);
+      Builder.CreateBr(suc);
+      Sync->eraseFromParent();
+      errs() << "SyncElimination: A sync is removed. " << "\n";
+      return true;
+    }
+
+    return false;
+  }
+};
+
+}
+
+char SyncElimination::ID = 0;
+static RegisterPass<SyncElimination> X("sync-elimination", "Do sync-elimination's pass", false, false);
+
+// Public interface to the SyncElimination pass
+FunctionPass *llvm::createSyncEliminationPass() {
+  return new SyncElimination();
+}
diff --git a/llvm/lib/Transforms/Tapir/Tapir.cpp b/llvm/lib/Transforms/Tapir/Tapir.cpp
new file mode 100644
index 00000000000000..50813076c64b10
--- /dev/null
+++ b/llvm/lib/Transforms/Tapir/Tapir.cpp
@@ -0,0 +1,43 @@
+//===-- Tapir.cpp ---------------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements common infrastructure for libLLVMTapirOpts.a, which
+// implements several transformations over the Tapir/LLVM intermediate
+// representation, including the C bindings for that library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Tapir.h"
+#include "llvm-c/Initialization.h"
+#include "llvm-c/Transforms/Tapir.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/IR/LegacyPassManager.h"
+
+using namespace llvm;
+
+/// initializeTapirOpts - Initialize all passes linked into the
+/// TapirOpts library.
+void llvm::initializeTapirOpts(PassRegistry &Registry) {
+  initializeLoopSpawningPass(Registry);
+  initializeLowerTapirToCilkPass(Registry);
+}
+
+void LLVMInitializeTapirOpts(LLVMPassRegistryRef R) {
+  initializeTapirOpts(*unwrap(R));
+}
+
+void LLVMAddLoopSpawningPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createLoopSpawningPass());
+}
+
+void LLVMAddLowerTapirToCilkPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createLowerTapirToCilkPass());
+}
diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
index 7da768252fc198..2402e4b99779c2 100644
--- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -149,6 +149,18 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU,
   // Don't break unwinding instructions.
   if (PredBB->getTerminator()->isExceptionalTerminator())
     return false;
+  // For now, don't break syncs.
+  // TODO: Don't break syncs unless they don't sync anything.
+  if (isa<SyncInst>(PredBB->getTerminator())) return false;
+  // Don't break entry blocks of detached CFG's.
+  for (pred_iterator PI = pred_begin(PredBB), PE = pred_end(PredBB);
+       PI != PE; ++PI) {
+    BasicBlock *PredPredBB = *PI;
+    if (const DetachInst *DI =
+        dyn_cast<DetachInst>(PredPredBB->getTerminator()))
+      if (DI->getDetached() == PredBB)
+        return false;
+  }
 
   // Can't merge if there are multiple distinct successors.
   if (PredBB->getUniqueSuccessor() != BB)
@@ -301,7 +313,18 @@ BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, DominatorTree *DT,
   // block.
   assert(BB->getTerminator()->getNumSuccessors() == 1 &&
          "Should have a single succ!");
-  return SplitBlock(BB, BB->getTerminator(), DT, LI, MSSAU);
+  // return SplitBlock(BB, BB->getTerminator(), DT, LI);
+  BasicBlock *NewBB = SplitBlock(BB, BB->getTerminator(), DT, LI, MSSAU);
+  if (SyncInst *OldSI = dyn_cast<SyncInst>(NewBB->getTerminator())) {
+    // Make sure the original BB is terminated by the sync.
+    SyncInst *SI = SyncInst::Create(NewBB, OldSI->getSyncRegion(),
+                                    BB->getTerminator());
+    BranchInst::Create(Succ, OldSI);
+    SI->setDebugLoc(OldSI->getDebugLoc());
+    BB->getTerminator()->eraseFromParent();
+    OldSI->eraseFromParent();
+  }
+  return NewBB;
 }
 
 unsigned
diff --git a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
index fafc9aaba5c9cc..befb2ed13587e9 100644
--- a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
+++ b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
@@ -137,10 +137,27 @@ llvm::SplitCriticalEdge(Instruction *TI, unsigned SuccNum,
   assert(!isa<IndirectBrInst>(TI) &&
          "Cannot split critical edge from IndirectBrInst");
+  assert(!isa<ReattachInst>(TI) &&
+         "Cannot split critical edge from ReattachInst");
+
+  bool SplittingDetachContinue = isa<DetachInst>(TI) && (1 == SuccNum);
+  if (SplittingDetachContinue)
+    assert((Options.SplitDetachContinue && Options.DT) &&
+           "Cannot split critical continuation edge from a detach");
 
   BasicBlock *TIBB = TI->getParent();
   BasicBlock *DestBB = TI->getSuccessor(SuccNum);
 
+  // If we're splitting a detach-continue edge, get the associated reattaches.
+  SmallVector<BasicBlock *, 4> Reattaches;
+  if (SplittingDetachContinue) {
+    BasicBlockEdge DetachEdge(TIBB, TI->getSuccessor(0));
+    for (BasicBlock *Pred : predecessors(DestBB))
+      if (isa<ReattachInst>(Pred->getTerminator()))
+        if (Options.DT->dominates(DetachEdge, Pred))
+          Reattaches.push_back(Pred);
+  }
+
   // Splitting the critical edge to a pad block is non-trivial. Don't do
   // it in this generic function.
   if (DestBB->isEHPad()) return nullptr;
@@ -155,6 +172,12 @@ llvm::SplitCriticalEdge(Instruction *TI, unsigned SuccNum,
   // Branch to the new block, breaking the edge.
   TI->setSuccessor(SuccNum, NewBB);
 
+  // If we're splitting a detach-continue edge, redirect all appropriate
+  // reattach edges to branch to the new block.
+  if (SplittingDetachContinue)
+    for (BasicBlock *RBB : Reattaches)
+      RBB->getTerminator()->setSuccessor(0, NewBB);
+
   // Insert the block into the function... right after the block TI lives in.
   Function &F = *TIBB->getParent();
   Function::iterator FBBI = TIBB->getIterator();
@@ -179,6 +202,28 @@ llvm::SplitCriticalEdge(Instruction *TI, unsigned SuccNum,
     BBIdx = PN->getBasicBlockIndex(TIBB);
     PN->setIncomingBlock(BBIdx, NewBB);
   }
+
+  // Update the PHI node entries for the reattach predecessors as well.
+  if (SplittingDetachContinue) {
+    for (BasicBlock *RBB : Reattaches) {
+      unsigned BBIdx = 0;
+      for (BasicBlock::iterator I = DestBB->begin(); isa<PHINode>(I); ++I) {
+        // We no longer enter through RBB, now we come in through NewBB.
+        // Revector exactly one entry in the PHI node that used to come from
+        // TIBB to come from NewBB.
+        PHINode *PN = cast<PHINode>(I);
+
+        // Reuse the previous value of BBIdx if it lines up. In cases where we
+        // have multiple phi nodes with *lots* of predecessors, this is a speed
+        // win because we don't have to scan the PHI looking for TIBB. This
+        // happens because the BB list of PHI nodes are usually in the same
+        // order.
+        if (PN->getIncomingBlock(BBIdx) != RBB)
+          BBIdx = PN->getBasicBlockIndex(RBB);
+        PN->removeIncomingValue(BBIdx);
+      }
+    }
+  }
 
   // If there are any other edges from TIBB to DestBB, update those to go
diff --git a/llvm/lib/Transforms/Utils/CMakeLists.txt b/llvm/lib/Transforms/Utils/CMakeLists.txt
index cb3dc17c03ad8d..e89b1d3c221cc2 100644
--- a/llvm/lib/Transforms/Utils/CMakeLists.txt
+++ b/llvm/lib/Transforms/Utils/CMakeLists.txt
@@ -54,6 +54,7 @@ add_llvm_library(LLVMTransformUtils
   SplitModule.cpp
   StripNonLineTableDebugInfo.cpp
   SymbolRewriter.cpp
+  TapirUtils.cpp
   UnifyFunctionExitNodes.cpp
   Utils.cpp
   ValueMapper.cpp
diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp
index 623fe91a5a6094..42ad327ab195e9 100644
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -62,6 +62,8 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/TapirUtils.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
@@ -1623,6 +1625,18 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
     !isa<ConstantTokenNone>(CallSiteUnwindDestToken);
   }
 
+  // Get the entry block of the detached context into which we're inlining. If
+  // we move allocas from the inlined code, we must move them to this block.
+  BasicBlock *DetachedCtxEntryBlock;
+  {
+    BasicBlock *CallingBlock = TheCall->getParent();
+    DetachedCtxEntryBlock = GetDetachedCtx(CallingBlock);
+    assert(((&(CallingBlock->getParent()->getEntryBlock()) ==
+             DetachedCtxEntryBlock) ||
+            DetachedCtxEntryBlock->getSinglePredecessor()) &&
+           "Entry block of detached context has multiple predecessors.");
+  }
+
   // Get an iterator to the last basic block in the function, which will have
   // the new function inlined after it.
   Function::iterator LastBlock = --Caller->end();
@@ -1781,7 +1795,8 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
   // calculate which instruction they should be inserted before. We insert the
   // instructions at the end of the current alloca list.
   {
-    BasicBlock::iterator InsertPoint = Caller->begin()->begin();
+    // BasicBlock::iterator InsertPoint = Caller->begin()->begin();
+    BasicBlock::iterator InsertPoint = DetachedCtxEntryBlock->begin();
     for (BasicBlock::iterator I = FirstNewBlock->begin(),
          E = FirstNewBlock->end(); I != E; ) {
       AllocaInst *AI = dyn_cast<AllocaInst>(I++);
@@ -1811,7 +1826,9 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
       // Transfer all of the allocas over in a block. Using splice means
       // that the instructions aren't removed from the symbol table, then
       // reinserted.
-      Caller->getEntryBlock().getInstList().splice(
+      // Caller->getEntryBlock().getInstList().splice(
+      //     InsertPoint, FirstNewBlock->getInstList(), AI->getIterator(), I);
+      DetachedCtxEntryBlock->getInstList().splice(
           InsertPoint, FirstNewBlock->getInstList(), AI->getIterator(), I);
     }
     // Move any dbg.declares describing the allocas into the entry basic block.
     for (auto &AI : IFI.StaticAllocas)
       replaceDbgDeclareForAlloca(AI, AI, DIB, DIExpression::NoDeref, 0,
                                  DIExpression::NoDeref);
+
+    // Move any syncregion_start's into the entry basic block.
+    for (BasicBlock::iterator I = FirstNewBlock->begin(),
+         E = FirstNewBlock->end(); I != E; ) {
+      IntrinsicInst *II = dyn_cast<IntrinsicInst>(I++);
+      if (!II) continue;
+      if (Intrinsic::syncregion_start != II->getIntrinsicID())
+        continue;
+
+      while (isa<IntrinsicInst>(I) &&
+             Intrinsic::syncregion_start ==
+             cast<IntrinsicInst>(I)->getIntrinsicID())
+        ++I;
+
+      DetachedCtxEntryBlock->getInstList().splice(
+          InsertPoint, FirstNewBlock->getInstList(), II->getIterator(), I);
+    }
   }
 
   SmallVector<Value *, 4> VarArgsToForward;
@@ -2224,6 +2258,7 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
   // this is an invoke instruction or a call instruction.
   BasicBlock *AfterCallBB;
   BranchInst *CreatedBranchToNormalDest = nullptr;
+
   if (InvokeInst *II = dyn_cast<InvokeInst>(TheCall)) {
 
     // Add an unconditional branch to make this look like the CallInst case...
diff --git a/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/llvm/lib/Transforms/Utils/LoopSimplify.cpp
index 380f4fca54d9ed..a9ac90d6e391da 100644
--- a/llvm/lib/Transforms/Utils/LoopSimplify.cpp
+++ b/llvm/lib/Transforms/Utils/LoopSimplify.cpp
@@ -522,6 +522,12 @@ static bool simplifyOneLoop(Loop *L, SmallVectorImpl<Loop *> &Worklist,
     if (Preheader)
       Changed = true;
   }
+  // Ensure that the preheader is not terminated by a sync.
+  if (Preheader && isa<SyncInst>(Preheader->getTerminator())) {
+    DEBUG(dbgs() << "LoopSimplify: Splitting sync-terminated preheader.\n");
+    SplitEdge(Preheader, L->getHeader(), DT, LI);
+    Preheader = L->getLoopPreheader();
+  }
 
   // Next, check to make sure that all exit nodes of the loop only have
   // predecessors that are inside of the loop. This check guarantees that the
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index da7ed2bd165268..f3feb40ac97e08 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -154,6 +154,15 @@ BasicBlock *llvm::foldBlockIntoPredecessor(BasicBlock *BB, LoopInfo *LI,
   return OnlyPred;
 }
 
+//! Identify if a loop could be a cilk for loop and thus disable unrolling
+bool isCilkFor(Loop* L) {
+  //TODO use a more precise detection of cilk for loops
+  for (BasicBlock* BB : L->blocks())
+    if (isa<DetachInst>(BB->getTerminator()))
+      return true;
+  return false;
+}
+
 /// Check if unrolling created a situation where we need to insert phi nodes to
 /// preserve LCSSA form.
 /// \param Blocks is a vector of basic blocks representing unrolled loop.
@@ -411,6 +420,7 @@ LoopUnrollResult llvm::UnrollLoop(
   // Are we eliminating the loop control altogether?
   bool CompletelyUnroll = Count == TripCount;
+  if (isCilkFor(L) && !CompletelyUnroll) return LoopUnrollResult::Unmodified;
   SmallVector<BasicBlock *, 8> ExitBlocks;
   L->getExitBlocks(ExitBlocks);
   std::vector<BasicBlock *> OriginalLoopBlocks = L->getBlocks();
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 112e80d27e345d..240e92b81d1873 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -613,6 +613,67 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT = nullptr,
   }
 }
 
+/// Returns true if the instruction in a loop is guaranteed to execute at least
+/// once.
+bool llvm::isGuaranteedToExecute(const Instruction &Inst,
+                                 const DominatorTree *DT, const Loop *CurLoop,
+                                 const LoopSafetyInfo *SafetyInfo) {
+  // We have to check to make sure that the instruction dominates all
+  // of the exit blocks. If it doesn't, then there is a path out of the loop
+  // which does not execute this instruction, so we can't hoist it.
+
+  // If the instruction is in the header block for the loop (which is very
+  // common), it is always guaranteed to dominate the exit blocks. Since this
+  // is a common case, and can save some work, check it now.
+  if (Inst.getParent() == CurLoop->getHeader())
+    // If there's a throw in the header block, we can't guarantee we'll reach
+    // Inst.
+    return !SafetyInfo->HeaderMayThrow;
+
+  // Somewhere in this loop there is an instruction which may throw and make us
+  // exit the loop.
+  if (SafetyInfo->MayThrow)
+    return false;
+
+  // Get the exit blocks for the current loop.
+  SmallVector<BasicBlock *, 8> ExitBlocks;
+  CurLoop->getExitBlocks(ExitBlocks);
+
+  // Verify that the block dominates each of the exit blocks of the loop.
+  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
+    if (!DT->dominates(Inst.getParent(), ExitBlocks[i])) {
+      bool valid = false;
+      for (BasicBlock *b : CurLoop->getBlocks()) {
+        if (auto RE = dyn_cast<ReattachInst>(b->getTerminator())) {
+          if (b == Inst.getParent() || DT->dominates(Inst.getParent(), b)) {
+            bool tv = true;
+            for (unsigned i2 = 0; i2 != e; ++i2) {
+              if (!DT->dominates(RE->getSuccessor(0), ExitBlocks[i2])) {
+                tv = false; break;
+              }
+            }
+            if (tv) {
+              valid = true;
+              break;
+            }
+          }
+        }
+      }
+      if (valid) continue;
+      return false;
+    }
+  }
+
+  // As a degenerate case, if the loop is statically infinite then we haven't
+  // proven anything since there are no exit blocks.
+  if (ExitBlocks.empty())
+    return false;
+
+  // FIXME: In general, we have to prove that the loop isn't an infinite loop.
+  // See http://llvm.org/PR24078 . (The "ExitBlocks.empty()" check above is
+  // just a special case of this.)
+  return true;
+}
+
 Optional<unsigned> llvm::getLoopEstimatedTripCount(Loop *L) {
   // Only support loops with a unique exiting block, and a latch.
   if (!L->getExitingBlock())
diff --git a/llvm/lib/Transforms/Utils/Mem2Reg.cpp b/llvm/lib/Transforms/Utils/Mem2Reg.cpp
index 23145e5847512a..269d9a18d12efa 100644
--- a/llvm/lib/Transforms/Utils/Mem2Reg.cpp
+++ b/llvm/lib/Transforms/Utils/Mem2Reg.cpp
@@ -35,18 +35,33 @@ STATISTIC(NumPromoted, "Number of alloca's promoted");
 
 static bool promoteMemoryToRegister(Function &F, DominatorTree &DT,
                                     AssumptionCache &AC) {
   std::vector<AllocaInst *> Allocas;
-  BasicBlock &BB = F.getEntryBlock(); // Get the entry node for the function
   bool Changed = false;
 
+  // Scan the function to get its entry block and all entry blocks of detached
+  // CFG's. We can perform this scan for entry blocks once for the function,
+  // because this pass preserves the CFG.
+  SmallVector<BasicBlock *, 4> EntryBlocks;
+  bool FunctionContainsDetach = false;
+  EntryBlocks.push_back(&F.getEntryBlock());
+  for (BasicBlock &BB : F)
+    if (BasicBlock *Pred = BB.getUniquePredecessor())
+      if (DetachInst *DI = dyn_cast<DetachInst>(Pred->getTerminator())) {
+        FunctionContainsDetach = true;
+        if (DI->getDetached() == &BB)
+          EntryBlocks.push_back(&BB);
+      }
+
   while (true) {
     Allocas.clear();
 
     // Find allocas that are safe to promote, by looking at all instructions in
     // the entry node
-    for (BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I)
-      if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) // Is it an alloca?
-        if (isAllocaPromotable(AI))
-          Allocas.push_back(AI);
+    for (BasicBlock *BB : EntryBlocks)
+      for (BasicBlock::iterator I = BB->begin(), E = --BB->end(); I != E; ++I)
+        if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) // Is it an alloca?
+          if (isAllocaPromotable(AI) &&
+              (!FunctionContainsDetach || isAllocaParallelPromotable(AI, DT)))
+            Allocas.push_back(AI);
 
     if (Allocas.empty())
       break;
diff --git a/llvm/lib/Transforms/Utils/ModuleUtils.cpp b/llvm/lib/Transforms/Utils/ModuleUtils.cpp
index ae5e72ea4d30f3..87aafa83ecfcab 100644
--- a/llvm/lib/Transforms/Utils/ModuleUtils.cpp
+++ b/llvm/lib/Transforms/Utils/ModuleUtils.cpp
@@ -127,6 +127,24 @@ void llvm::appendToCompilerUsed(Module &M, ArrayRef<GlobalValue *> Values) {
   appendToUsedList(M, "llvm.compiler.used", Values);
 }
 
+Function *llvm::checkCsiInterfaceFunction(Constant *FuncOrBitcast) {
+  if (Function *F = dyn_cast<Function>(FuncOrBitcast)) {
+    return F;
+  }
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(FuncOrBitcast)) {
+    if (CE->isCast() && CE->getOpcode() == Instruction::BitCast) {
+      if (Function *F = dyn_cast<Function>(CE->getOperand(0))) {
+        return F;
+      }
+    }
+  }
+  FuncOrBitcast->print(errs());
+  std::string Err;
+  raw_string_ostream Stream(Err);
+  Stream << "ComprehensiveStaticInstrumentation interface function redefined: " << *FuncOrBitcast;
+  report_fatal_error(Err);
+}
+
 Function *llvm::checkSanitizerInterfaceFunction(Constant *FuncOrBitcast) {
   if (isa<Function>(FuncOrBitcast))
     return cast<Function>(FuncOrBitcast);
diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index 91e4f4254b3e76..7e87fce8edf218 100644
--- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -61,6 +61,7 @@ STATISTIC(NumLocalPromoted, "Number of alloca's promoted within one block");
 STATISTIC(NumSingleStore,   "Number of alloca's promoted with a single store");
 STATISTIC(NumDeadAlloca,    "Number of dead alloca's removed");
 STATISTIC(NumPHIInsert,     "Number of PHI nodes inserted");
+STATISTIC(NumAllocaWithDetachedUses, "Number of alloca's with detached uses");
 
 bool llvm::isAllocaPromotable(const AllocaInst *AI) {
   // FIXME: If the memory unit is of pointer or integer type, we can permit
@@ -143,13 +144,12 @@ struct AllocaInfo {
       DefiningBlocks.push_back(SI->getParent());
       AllocaPointerVal = SI->getOperand(0);
       OnlyStore = SI;
-    } else {
-      LoadInst *LI = cast<LoadInst>(User);
+    } else if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
       // Otherwise it must be a load instruction, keep track of variable
       // reads.
       UsingBlocks.push_back(LI->getParent());
       AllocaPointerVal = LI;
-    }
+    } else continue;
 
     if (OnlyUsedInOneBlock) {
       if (!OnlyBlock)
@@ -556,10 +556,18 @@ void PromoteMem2Reg::run() {
   LargeBlockInfo LBI;
   ForwardIDFCalculator IDF(DT);
 
+  bool FunctionContainsDetach = false;
+  {
+    for (BasicBlock &BB : F)
+      FunctionContainsDetach |= isa<DetachInst>(BB.getTerminator());
+  }
+
   for (unsigned AllocaNum = 0; AllocaNum != Allocas.size(); ++AllocaNum) {
     AllocaInst *AI = Allocas[AllocaNum];
     assert(isAllocaPromotable(AI) && "Cannot promote non-promotable alloca!");
+    assert((!FunctionContainsDetach || isAllocaParallelPromotable(AI, DT)) &&
+           "Cannot promote non-promotable alloca in function with detach!");
     assert(AI->getParent()->getParent() == &F &&
            "All allocas should be in the same function, which is same as DF!");
@@ -607,17 +615,8 @@ void PromoteMem2Reg::run() {
     BBNumbers[&BB] = ID++;
   }
 
-  // Remember the dbg.declare intrinsic describing this alloca, if any.
-  if (!Info.DbgDeclares.empty())
-    AllocaDbgDeclares[AllocaNum] = Info.DbgDeclares;
-
-  // Keep the reverse mapping of the 'Allocas' array for the rename pass.
-  AllocaLookup[Allocas[AllocaNum]] = AllocaNum;
-
-  // At this point, we're committed to promoting the alloca using IDF's, and
-  // the standard SSA construction algorithm. Determine which blocks need PHI
-  // nodes and see if we can optimize out some work by avoiding insertion of
-  // dead phi nodes.
+  // Determine which blocks need PHI nodes and see if we can optimize out some
+  // work by avoiding insertion of dead phi nodes.
 
   // Unique the set of defining blocks for efficient lookup.
SmallPtrSet DefBlocks; @@ -628,14 +627,44 @@ void PromoteMem2Reg::run() { SmallPtrSet LiveInBlocks; ComputeLiveInBlocks(AI, Info, DefBlocks, LiveInBlocks); - // At this point, we're committed to promoting the alloca using IDF's, and - // the standard SSA construction algorithm. Determine which blocks need phi - // nodes and see if we can optimize out some work by avoiding insertion of - // dead phi nodes. + // Determine which blocks need PHI nodes and see if we can optimize out some + // work by avoiding insertion of dead phi nodes. IDF.setLiveInBlocks(LiveInBlocks); IDF.setDefiningBlocks(DefBlocks); SmallVector PHIBlocks; IDF.calculate(PHIBlocks); + + // Determine which PHI nodes want to use a value from a detached + // predecessor. Because register state is not preserved across a reattach, + // these alloca's cannot be promoted. + bool DetachedPred = false; + for (unsigned i = 0, e = PHIBlocks.size(); i != e && !DetachedPred; ++i) { + BasicBlock *BB = PHIBlocks[i]; + for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); + PI != E && !DetachedPred; ++PI) { + BasicBlock *P = *PI; + if (isa(P->getTerminator())) { + DEBUG(dbgs() << "Alloca " << *AI << " has use reattached from " << + P->getName() << "\n"); + DetachedPred = true; + } + } + } + if (DetachedPred) { + RemoveFromAllocasList(AllocaNum); + ++NumAllocaWithDetachedUses; + continue; + } + + // Remember the dbg.declare intrinsic describing this alloca, if any. + if (!Info.DbgDeclares.empty()) + AllocaDbgDeclares[AllocaNum] = Info.DbgDeclares; + + // Keep the reverse mapping of the 'Allocas' array for the rename pass. + AllocaLookup[Allocas[AllocaNum]] = AllocaNum; + + // At this point, we're committed to promoting the alloca using IDF's, and + // the standard SSA construction algorithm. 
if (PHIBlocks.size() > 1) llvm::sort(PHIBlocks, [this](BasicBlock *A, BasicBlock *B) { return BBNumbers.lookup(A) < BBNumbers.lookup(B); @@ -791,7 +820,7 @@ void PromoteMem2Reg::run() { /// These are blocks which lead to uses. Knowing this allows us to avoid /// inserting PHI nodes into blocks which don't lead to uses (thus, the /// inserted phi nodes would be dead). -void PromoteMem2Reg::ComputeLiveInBlocks( +static void ExternComputeLiveInBlocks( AllocaInst *AI, AllocaInfo &Info, const SmallPtrSetImpl<BasicBlock *> &DefBlocks, SmallPtrSetImpl<BasicBlock *> &LiveInBlocks) { @@ -860,6 +889,62 @@ void PromoteMem2Reg::ComputeLiveInBlocks( } } +void PromoteMem2Reg::ComputeLiveInBlocks( + AllocaInst *AI, AllocaInfo &Info, + const SmallPtrSetImpl<BasicBlock *> &DefBlocks, + SmallPtrSetImpl<BasicBlock *> &LiveInBlocks) { + ExternComputeLiveInBlocks(AI, Info, DefBlocks, LiveInBlocks); +} + +// \brief Augmented version of isAllocaPromotable that also handles detach and +// reattach. +// +// TODO: Replace the implementation of this method with one that uses an +// analysis of parallel regions. +bool llvm::isAllocaParallelPromotable(const AllocaInst *AIP, + DominatorTree &DT) { + AllocaInst *AI = const_cast<AllocaInst *>(AIP); + AllocaInfo Info; + LargeBlockInfo LBI; + ForwardIDFCalculator IDF(DT); + + // Calculate the set of read and write-locations for each alloca. This is + // analogous to finding the 'uses' and 'definitions' of each variable. + Info.AnalyzeAlloca(AI); + + if (Info.OnlyUsedInOneBlock) return true; + + // Unique the set of defining blocks for efficient lookup. + SmallPtrSet<BasicBlock *, 32> DefBlocks; + DefBlocks.insert(Info.DefiningBlocks.begin(), Info.DefiningBlocks.end()); + + // Determine which blocks the value is live in. These are blocks which lead + // to uses. + SmallPtrSet<BasicBlock *, 32> LiveInBlocks; + ExternComputeLiveInBlocks(AI, Info, DefBlocks, LiveInBlocks); + + // Determine which blocks need PHI nodes and see if we can optimize out some + // work by avoiding insertion of dead phi nodes.
+ IDF.setLiveInBlocks(LiveInBlocks); + IDF.setDefiningBlocks(DefBlocks); + SmallVector PHIBlocks; + IDF.calculate(PHIBlocks); + + // Determine which PHI nodes want to use a value from a detached predecessor. + // Because register state is not preserved across a reattach, these alloca's + // cannot be promoted. + for (unsigned i = 0, e = PHIBlocks.size(); i != e; ++i) { + BasicBlock *BB = PHIBlocks[i]; + for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); + PI != E; ++PI) { + BasicBlock *P = *PI; + if (isa(P->getTerminator())) + return false; + } + } + + return true; +} + /// Queue a phi-node to be added to a basic-block for a specific Alloca. /// /// Returns true if there wasn't already a phi-node for that variable diff --git a/llvm/lib/Transforms/Utils/SSAUpdater.cpp b/llvm/lib/Transforms/Utils/SSAUpdater.cpp index 9e5fb0e7172d4d..c9dced38c694f2 100644 --- a/llvm/lib/Transforms/Utils/SSAUpdater.cpp +++ b/llvm/lib/Transforms/Utils/SSAUpdater.cpp @@ -44,11 +44,18 @@ static AvailableValsTy &getAvailableVals(void *AV) { return *static_cast(AV); } +typedef DenseMap ValIsDetachedTy; +static ValIsDetachedTy &getValIsDetached(void *VID) { + return *static_cast(VID); +} + SSAUpdater::SSAUpdater(SmallVectorImpl *NewPHI) : InsertedPHIs(NewPHI) {} SSAUpdater::~SSAUpdater() { delete static_cast(AV); + if (VID) + delete static_cast(VID); } void SSAUpdater::Initialize(Type *Ty, StringRef Name) { @@ -56,6 +63,10 @@ void SSAUpdater::Initialize(Type *Ty, StringRef Name) { AV = new AvailableValsTy(); else getAvailableVals(AV).clear(); + if (!VID) + VID = new ValIsDetachedTy(); + else + getValIsDetached(VID).clear(); ProtoType = Ty; ProtoName = Name; } @@ -107,6 +118,7 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { // predecessor. SmallVector, 8> PredValues; Value *SingularValue = nullptr; + BasicBlock *DetachPred = nullptr, *ReattachPred = nullptr; // We can get our predecessor info by walking the pred_iterator list, but it // is relatively slow. 
If we already have PHI nodes in this block, walk one @@ -115,6 +127,12 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { for (unsigned i = 0, e = SomePhi->getNumIncomingValues(); i != e; ++i) { BasicBlock *PredBB = SomePhi->getIncomingBlock(i); Value *PredVal = GetValueAtEndOfBlock(PredBB); + if (isa(PredBB->getTerminator())) { + ReattachPred = PredBB; + continue; + } + if (isa(PredBB->getTerminator())) + DetachPred = PredBB; PredValues.push_back(std::make_pair(PredBB, PredVal)); // Compute SingularValue. @@ -128,6 +146,12 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { BasicBlock *PredBB = *PI; Value *PredVal = GetValueAtEndOfBlock(PredBB); + if (isa(PredBB->getTerminator())) { + ReattachPred = PredBB; + continue; + } + if (isa(PredBB->getTerminator())) + DetachPred = PredBB; PredValues.push_back(std::make_pair(PredBB, PredVal)); // Compute SingularValue. @@ -138,6 +162,18 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { SingularValue = nullptr; } } + // Record any values we discover whose definitions occur in detached blocks. + if (ReattachPred) { + assert(DetachPred && + "Reattached predecessor of a block with no detached predecessor."); + Value *DetachVal = GetValueAtEndOfBlock(DetachPred); + PredValues.push_back(std::make_pair(ReattachPred, DetachVal)); + Value *ReattachVal = GetValueAtEndOfBlock(ReattachPred); + if (ReattachVal != DetachVal) { + SingularValue = nullptr; + getValIsDetached(VID)[BB] = true; + } + } // If there are no predecessors, just return undef. 
if (PredValues.empty()) @@ -187,6 +223,10 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { return InsertedPHI; } +bool SSAUpdater::GetValueIsDetachedInBlock(BasicBlock *BB) { + return getValIsDetached(VID)[BB]; +} + void SSAUpdater::RewriteUse(Use &U) { Instruction *User = cast<Instruction>(U.getUser()); @@ -274,6 +314,18 @@ class SSAUpdaterTraits<SSAUpdater> { return UndefValue::get(Updater->ProtoType); } + /// BlockReattaches - Return true if this block is terminated with a + /// reattach, false otherwise. + static bool BlockReattaches(BasicBlock *BB, SSAUpdater *Updater) { + return isa<ReattachInst>(BB->getTerminator()); + } + + /// BlockDetaches - Return true if this block is terminated with a + /// detach, false otherwise. + static bool BlockDetaches(BasicBlock *BB, SSAUpdater *Updater) { + return isa<DetachInst>(BB->getTerminator()); + } + /// CreateEmptyPHI - Create a new PHI instruction in the specified block. /// Reserve space for the operands but do not fill them in yet. static Value *CreateEmptyPHI(BasicBlock *BB, unsigned NumPreds, @@ -326,7 +378,8 @@ Value *SSAUpdater::GetValueAtEndOfBlockInternal(BasicBlock *BB) { if (Value *V = AvailableVals[BB]) return V; - SSAUpdaterImpl<SSAUpdater> Impl(this, &AvailableVals, InsertedPHIs); + SSAUpdaterImpl<SSAUpdater> Impl(this, &AvailableVals, InsertedPHIs, + &getValIsDetached(VID)); return Impl.GetValue(BB); } @@ -448,7 +501,14 @@ run(const SmallVectorImpl<Instruction *> &Insts) const { // Okay, now we rewrite all loads that use live-in values in the loop, // inserting PHI nodes as necessary. for (LoadInst *ALoad : LiveInLoads) { - Value *NewVal = SSA.GetValueInMiddleOfBlock(ALoad->getParent()); + BasicBlock *BB = ALoad->getParent(); + Value *NewVal = SSA.GetValueInMiddleOfBlock(BB); + + // Skip loads whose definitions are detached. + if (Instruction *Def = dyn_cast<Instruction>(NewVal)) + if (SSA.GetValueIsDetachedInBlock(Def->getParent())) + continue; + replaceLoadWithValue(ALoad, NewVal); // Avoid assertions in unreachable code.
@@ -463,6 +523,8 @@ run(const SmallVectorImpl &Insts) const { // Now that everything is rewritten, delete the old instructions from the // function. They should all be dead now. for (Instruction *User : Insts) { + if (isa(User) && !User->use_empty()) continue; + // If this is a load that still has uses, then the load must have been added // as a live value in the SSAUpdate data structure for a block (e.g. because // the loaded value was stored later). In this case, we need to recursively diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 03b73954321d86..7480b94e34ab61 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -66,6 +66,8 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/TapirUtils.h" #include "llvm/Transforms/Utils/ValueMapper.h" #include #include @@ -5751,6 +5753,14 @@ static bool TryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI, return false; } +static bool BlockIsEntryOfDetachedCtx(const BasicBlock *BB) { + if (const BasicBlock *PredBB = BB->getSinglePredecessor()) + if (const DetachInst *DI = dyn_cast(PredBB->getTerminator())) + if (DI->getDetached() == BB) + return true; + return false; +} + bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder) { BasicBlock *BB = BI->getParent(); @@ -5769,6 +5779,7 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, (LoopHeaders->count(BB) || LoopHeaders->count(Succ))); BasicBlock::iterator I = BB->getFirstNonPHIOrDbg()->getIterator(); if (I->isTerminator() && BB != &BB->getParent()->getEntryBlock() && + !BlockIsEntryOfDetachedCtx(BB) && !NeedCanonicalLoop && TryToSimplifyUncondBranchFromEmptyBlock(BB)) return true; @@ -5993,6 +6004,139 @@ static bool removeUndefIntroducingPredecessor(BasicBlock *BB) { return false; 
} +/// If BB immediately syncs and BB's predecessor detaches, serialize +/// the sync and detach. This will allow normal serial +/// optimization passes to remove the blocks appropriately. Returns +/// false if BB does not begin with a sync. +static bool serializeDetachToImmediateSync(BasicBlock *BB) { + Instruction *I = BB->getFirstNonPHIOrDbgOrLifetime(); + if (isa<SyncInst>(I)) { + // This block is empty. + bool Changed = false; + // Collect the detach and reattach predecessors. + SmallSet<DetachInst *, 8> DetachPreds; + SmallVector<Instruction *, 8> ReattachPreds; + for (BasicBlock *PredBB : predecessors(BB)) { + if (DetachInst *DI = dyn_cast<DetachInst>(PredBB->getTerminator())) + DetachPreds.insert(DI); + + if (ReattachInst *RI = dyn_cast<ReattachInst>(PredBB->getTerminator())) + ReattachPreds.push_back(RI); + } + Value *SyncRegion = cast<SyncInst>(I)->getSyncRegion(); + for (DetachInst *DI : DetachPreds) { + BasicBlock *Detached = DI->getDetached(); + + // Replace the detach with a branch to the detached block. + BB->removePredecessor(DI->getParent()); + ReplaceInstWithInst(DI, BranchInst::Create(Detached)); + + // Move static alloca instructions in the detached block to the + // appropriate entry block. + MoveStaticAllocasInBlock(cast<Instruction>(SyncRegion)->getParent(), + Detached, ReattachPreds); + // We should not need to add new llvm.stacksave/llvm.stackrestore + // intrinsics, because we're not introducing new alloca's into a loop. + Changed = true; + } + for (Instruction *RI : ReattachPreds) { + // Replace the reattach with an unconditional branch. + ReplaceInstWithInst(RI, BranchInst::Create(BB)); + Changed = true; + } + return Changed; + } + return false; +} + +/// If BB immediately reattaches and BB's predecessor detaches, +/// serialize the reattach and detach. This will allow normal serial +/// optimization passes to remove the blocks appropriately. Returns +/// false if BB does not begin with a reattach or if some predecessor +/// does not terminate with a detach.
+static bool serializeTrivialDetachedBlock(BasicBlock *BB) { + Instruction *I = BB->getFirstNonPHI(); + if (ReattachInst *RI = dyn_cast<ReattachInst>(I)) { + // This detached block is empty. + // Scan predecessors to verify that all of them detach BB. + for (BasicBlock *PredBB : predecessors(BB)) { + if (!isa<DetachInst>(PredBB->getTerminator())) + return false; + } + // All predecessors detach BB, so we can serialize. + for (BasicBlock *PredBB : predecessors(BB)) { + DetachInst *DI = cast<DetachInst>(PredBB->getTerminator()); + BasicBlock *Detached = DI->getDetached(); + BasicBlock *Continue = DI->getContinue(); + assert(RI->getSuccessor(0) == Continue && + "Reattach destination does not match continue block of associated detach."); + // Remove the predecessor through the detach from the continue + // block. + Continue->removePredecessor(PredBB); + // Serialize the detach: replace it with an unconditional branch. + ReplaceInstWithInst(DI, BranchInst::Create(Detached)); + } + // Serialize the reattach: replace it with an unconditional branch. + ReplaceInstWithInst(RI, BranchInst::Create(RI->getSuccessor(0))); + return true; + } + return false; +} + +/// If BB detaches a CFG that cannot reach the continuation, serialize the +/// detach. Assuming the CFG is valid, this scenario arises when the detached +/// CFG is terminated by unreachable instructions. +static bool serializeDetachOfUnreachable(BasicBlock *BB) { + // This method assumes that the detached CFG is valid. + Instruction *I = BB->getTerminator(); + if (DetachInst *DI = dyn_cast<DetachInst>(I)) { + // Check that the continuation of the detach is not reached by any + // reattach instruction. If the detached CFG is valid, then the detached + // CFG must be terminated by unreachable instructions. + BasicBlock *Continue = DI->getContinue(); + for (BasicBlock *PredBB : predecessors(Continue)) + if (isa<ReattachInst>(PredBB->getTerminator())) + return false; + // TODO: Add stronger checks to make sure the detached CFG is valid.
+ // Remove the predecessor through the detach from the continue + // block. + Continue->removePredecessor(BB); + // Replace the detach with a branch to the detached block. + ReplaceInstWithInst(DI, BranchInst::Create(DI->getDetached())); + return true; + } + return false; +} + +// Remove any syncs whose sync region is empty, meaning that the region contains +// no detach instructions. These sync instructions don't synchronize anything, +// so they can be removed. +static bool removeEmptySyncs(BasicBlock *BB) { + if (SyncInst *SI = dyn_cast(BB->getTerminator())) { + // Get the sync region containing this sync + Value *SyncRegion = SI->getSyncRegion(); + bool SyncRegionIsEmpty = true; + SmallVector Syncs; + // Scan the Tapir instructions in this sync region. + for (User *U : SyncRegion->users()) { + // If the sync region contains a detach or a reattach, then it's not + // empty. + if (isa(U) || isa(U)) + SyncRegionIsEmpty = false; + // Collect the syncs in this region. + else if (isa(U)) + Syncs.push_back(cast(U)); + } + // If the sync region is empty, then remove all sync instructions in it. + if (SyncRegionIsEmpty) { + for (SyncInst *Sync : Syncs) + ReplaceInstWithInst(Sync, BranchInst::Create(Sync->getSuccessor(0))); + return true; + } + } + return false; +} + bool SimplifyCFGOpt::simplifyOnce(BasicBlock *BB) { bool Changed = false; @@ -6018,6 +6162,14 @@ bool SimplifyCFGOpt::simplifyOnce(BasicBlock *BB) { // Check for and remove branches that will always cause undefined behavior. Changed |= removeUndefIntroducingPredecessor(BB); + // Check for and remove trivial detached blocks. + Changed |= serializeTrivialDetachedBlock(BB); + Changed |= serializeDetachToImmediateSync(BB); + Changed |= serializeDetachOfUnreachable(BB); + + // Check for and remove sync instructions in empty sync regions. 
+ Changed |= removeEmptySyncs(BB); + // Merge basic blocks into their predecessor if there is only one distinct // pred, and if there is only one distinct successor of the predecessor, and // if there are no PHI nodes. diff --git a/llvm/lib/Transforms/Utils/TapirUtils.cpp b/llvm/lib/Transforms/Utils/TapirUtils.cpp new file mode 100644 index 00000000000000..cba2f39411076d --- /dev/null +++ b/llvm/lib/Transforms/Utils/TapirUtils.cpp @@ -0,0 +1,318 @@ +//===-- TapirUtils.cpp - Utility methods for Tapir -------------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements utility methods for handling code containing Tapir +// instructions. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/TapirUtils.h" +#include "llvm/IR/DIBuilder.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Transforms/Utils/Local.h" + +using namespace llvm; + +#define DEBUG_TYPE "tapirutils" + +/// Return the result of AI->isStaticAlloca() if AI were moved to the entry +/// block. Allocas used in inalloca calls and allocas of dynamic array size +/// cannot be static. +/// (Borrowed from Transforms/Utils/InlineFunction.cpp) +static bool allocaWouldBeStaticInEntry(const AllocaInst *AI) { + return isa<Constant>(AI->getArraySize()) && !AI->isUsedWithInAlloca(); +} + +// Check whether this Value is used by a lifetime intrinsic.
+static bool isUsedByLifetimeMarker(Value *V) { + for (User *U : V->users()) { + if (IntrinsicInst *II = dyn_cast(U)) { + switch (II->getIntrinsicID()) { + default: break; + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + return true; + } + } + } + return false; +} + +// Check whether the given alloca already has +// lifetime.start or lifetime.end intrinsics. +static bool hasLifetimeMarkers(AllocaInst *AI) { + Type *Ty = AI->getType(); + Type *Int8PtrTy = Type::getInt8PtrTy(Ty->getContext(), + Ty->getPointerAddressSpace()); + if (Ty == Int8PtrTy) + return isUsedByLifetimeMarker(AI); + + // Do a scan to find all the casts to i8*. + for (User *U : AI->users()) { + if (U->getType() != Int8PtrTy) continue; + if (U->stripPointerCasts() != AI) continue; + if (isUsedByLifetimeMarker(U)) + return true; + } + return false; +} + +// Move static allocas in a cloned block into the entry block of helper. Leave +// lifetime markers behind for those static allocas. Returns true if the cloned +// block still contains dynamic allocas, which cannot be moved. +bool llvm::MoveStaticAllocasInBlock( + BasicBlock *Entry, + BasicBlock *Block, + SmallVectorImpl &ExitPoints) { + Function *F = Entry->getParent(); + SmallVector StaticAllocas; + bool ContainsDynamicAllocas = false; + BasicBlock::iterator InsertPoint = Entry->begin(); + for (BasicBlock::iterator I = Block->begin(), + E = Block->end(); I != E; ) { + AllocaInst *AI = dyn_cast(I++); + if (!AI) continue; + + if (!allocaWouldBeStaticInEntry(AI)) { + ContainsDynamicAllocas = true; + continue; + } + + StaticAllocas.push_back(AI); + + // Scan for the block of allocas that we can move over, and move them + // all at once. + while (isa(I) && + allocaWouldBeStaticInEntry(cast(I))) { + StaticAllocas.push_back(cast(I)); + ++I; + } + + // Transfer all of the allocas over in a block. Using splice means + // that the instructions aren't removed from the symbol table, then + // reinserted. 
+ Entry->getInstList().splice( + InsertPoint, Block->getInstList(), AI->getIterator(), I); + } + // Move any dbg.declares describing the allocas into the entry basic block. + DIBuilder DIB(*F->getParent()); + for (auto &AI : StaticAllocas) + replaceDbgDeclareForAlloca(AI, AI, DIB, /*Deref=*/false); + + // Move any syncregion_start's into the entry basic block. + for (BasicBlock::iterator I = Block->begin(), + E = Block->end(); I != E; ) { + IntrinsicInst *II = dyn_cast<IntrinsicInst>(I++); + if (!II) continue; + if (Intrinsic::syncregion_start != II->getIntrinsicID()) + continue; + + while (isa<IntrinsicInst>(I) && + Intrinsic::syncregion_start == + cast<IntrinsicInst>(I)->getIntrinsicID()) + ++I; + + Entry->getInstList().splice( + InsertPoint, Block->getInstList(), II->getIterator(), I); + } + + // Leave lifetime markers for the static alloca's, scoping them from the + // cloned block to the cloned exit. + if (!StaticAllocas.empty()) { + IRBuilder<> Builder(&Block->front()); + for (unsigned ai = 0, ae = StaticAllocas.size(); ai != ae; ++ai) { + AllocaInst *AI = StaticAllocas[ai]; + // Don't mark swifterror allocas. They can't have bitcast uses. + if (AI->isSwiftError()) + continue; + + // If the alloca is already scoped to something smaller than the whole + // function then there's no need to add redundant, less accurate markers. + if (hasLifetimeMarkers(AI)) + continue; + + // Try to determine the size of the allocation. + ConstantInt *AllocaSize = nullptr; + if (ConstantInt *AIArraySize = + dyn_cast<ConstantInt>(AI->getArraySize())) { + auto &DL = F->getParent()->getDataLayout(); + Type *AllocaType = AI->getAllocatedType(); + uint64_t AllocaTypeSize = DL.getTypeAllocSize(AllocaType); + uint64_t AllocaArraySize = AIArraySize->getLimitedValue(); + + // Don't add markers for zero-sized allocas. + if (AllocaArraySize == 0) + continue; + + // Check that array size doesn't saturate uint64_t and doesn't + // overflow when it's multiplied by type size.
+ if (AllocaArraySize != ~0ULL && + UINT64_MAX / AllocaArraySize >= AllocaTypeSize) { + AllocaSize = ConstantInt::get(Type::getInt64Ty(AI->getContext()), + AllocaArraySize * AllocaTypeSize); + } + } + + Builder.CreateLifetimeStart(AI, AllocaSize); + for (Instruction *ExitPoint : ExitPoints) { + IRBuilder<>(ExitPoint).CreateLifetimeEnd(AI, AllocaSize); + } + } + } + + return ContainsDynamicAllocas; +} + + +/// SerializeDetachedCFG - Serialize the sub-CFG detached by the +/// specified detach instruction. Removes the detach instruction and +/// returns a pointer to the branch instruction that replaces it. +/// +BranchInst *llvm::SerializeDetachedCFG(DetachInst *DI, DominatorTree *DT) { + // Get the parent of the detach instruction. + BasicBlock *Detacher = DI->getParent(); + // Get the detached block and continuation of this detach. + BasicBlock *Detached = DI->getDetached(); + BasicBlock *Continuation = DI->getContinue(); + + assert(Detached->getSinglePredecessor() && + "Detached block has multiple predecessors."); + + // Get the detach edge from DI. + BasicBlockEdge DetachEdge(Detacher, Detached); + + // Collect the reattaches into the continuation. If DT is + // available, verify that all reattaches are dominated by the detach + // edge from DI. + SmallVector Reattaches; + // If we only find a single reattach into the continuation, capture + // it so we can later update the dominator tree. + BasicBlock *SingleReattacher = nullptr; + int ReattachesFound = 0; + for (auto PI = pred_begin(Continuation), PE = pred_end(Continuation); + PI != PE; PI++) { + BasicBlock *Pred = *PI; + // Skip the detacher. + if (Detacher == Pred) continue; + // Record the reattaches found. 
+ if (isa(Pred->getTerminator())) { + ReattachesFound++; + if (!SingleReattacher) + SingleReattacher = Pred; + if (DT) { + assert(DT->dominates(DetachEdge, Pred) && + "Detach edge does not dominate a reattach into its continuation."); + } + Reattaches.push_back(cast(Pred->getTerminator())); + } + } + // TODO: It's possible to detach a CFG that does not terminate with a + // reattach. For example, optimizations can create detached CFG's that are + // terminated by unreachable terminators only. Some of these special cases + // lead to problems with other passes, however, and this check will identify + // those special cases early while we sort out those issues. + assert(!Reattaches.empty() && "No reattach found for detach."); + + // Replace each reattach with branches to the continuation. + for (ReattachInst *RI : Reattaches) { + BranchInst *ReplacementBr = BranchInst::Create(Continuation, RI); + ReplacementBr->setDebugLoc(RI->getDebugLoc()); + RI->eraseFromParent(); + } + + // Replace the new detach with a branch to the detached CFG. + BranchInst *ReplacementBr = BranchInst::Create(Detached, DI); + ReplacementBr->setDebugLoc(DI->getDebugLoc()); + DI->eraseFromParent(); + + // Update the dominator tree. + if (DT) + if (DT->dominates(Detacher, Continuation) && 1 == ReattachesFound) + DT->changeImmediateDominator(Continuation, SingleReattacher); + + return ReplacementBr; +} + +/// GetDetachedCtx - Get the entry basic block to the detached context +/// that contains the specified block. +/// +BasicBlock *llvm::GetDetachedCtx(BasicBlock *BB) { + return const_cast( + GetDetachedCtx(const_cast(BB))); +} + +const BasicBlock *llvm::GetDetachedCtx(const BasicBlock *BB) { + // Traverse the CFG backwards until we either reach the entry block + // of the function or we find a detach instruction that detaches the + // current block. 
+ SmallPtrSet Visited; + SmallVector WorkList; + WorkList.push_back(BB); + while (!WorkList.empty()) { + const BasicBlock *CurrBB = WorkList.pop_back_val(); + if (!Visited.insert(CurrBB).second) + continue; + + for (auto PI = pred_begin(CurrBB), PE = pred_end(CurrBB); + PI != PE; ++PI) { + const BasicBlock *PredBB = *PI; + + // Skip predecessors via reattach instructions. The detacher + // block corresponding to this reattach is also a predecessor of + // the current basic block. + if (isa(PredBB->getTerminator())) + continue; + + // If the predecessor is terminated by a detach, check to see if + // that detach detached the current basic block. + if (isa(PredBB->getTerminator())) { + const DetachInst *DI = cast(PredBB->getTerminator()); + if (DI->getDetached() == CurrBB) + // Return the current block, which is the entry of this detached + // sub-CFG. + return CurrBB; + } + + // Otherwise, add the predecessor block to the work list to + // search. + WorkList.push_back(PredBB); + } + } + + // Our search didn't find anything, so return the entry of the + // function containing the given block. + return &(BB->getParent()->getEntryBlock()); +} + +/// isCriticalContinueEdge - Return true if the specified edge is a critical +/// detach-continue edge. Critical detach-continue edges are critical edges - +/// from a block with multiple successors to a block with multiple predecessors +/// - even after ignoring all reattach edges. +bool llvm::isCriticalContinueEdge(const TerminatorInst *TI, unsigned SuccNum) { + assert(SuccNum < TI->getNumSuccessors() && "Illegal edge specification!"); + if (TI->getNumSuccessors() == 1) return false; + + // Edge must come from a detach. + if (!isa(TI)) return false; + // Edge must go to the continuation. + if (SuccNum != 1) return false; + + const BasicBlock *Dest = TI->getSuccessor(SuccNum); + const_pred_iterator I = pred_begin(Dest), E = pred_end(Dest); + + // If there is more than one predecessor, this is a critical edge... 
+ assert(I != E && "No preds, but we have an edge to the block?"); + const BasicBlock *DetachPred = TI->getParent(); + for (; I != E; ++I) { + if (DetachPred == *I) continue; + if (isa((*I)->getTerminator())) continue; + return true; + } + return false; +} diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index c45dee590b8452..3d7800dd9b82b0 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2741,6 +2741,15 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { assert(VectorPH && "Invalid loop structure"); assert(ExitBlock && "Must have an exit block"); + BasicBlock *sync_split = nullptr; + if (isa(VectorPH->getTerminator())) { + sync_split = VectorPH->splitBasicBlockWithTerminator("vector.sync_split"); + DT->splitBlock(sync_split); + //DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); + DT->verifyDomTree(); + VectorPH = sync_split; + } + // Some loops have a single integer induction variable, while other loops // don't. One example is c++ iterators that often have multiple pointer // induction variables. 
In the code below we also support a case where we @@ -2773,6 +2782,7 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { ParentLoop->addChildLoop(Lp); ParentLoop->addBasicBlockToLoop(ScalarPH, *LI); ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI); + if (sync_split) ParentLoop->addBasicBlockToLoop(sync_split, *LI); } else { LI->addTopLevelLoop(Lp); } diff --git a/llvm/microbenchmarks/everything/everything.c b/llvm/microbenchmarks/everything/everything.c new file mode 100644 index 00000000000000..d2dd0aa96e5f2c --- /dev/null +++ b/llvm/microbenchmarks/everything/everything.c @@ -0,0 +1,32 @@ +#include +#include + +int foo() { + return 10; +} + +int bar(); + +int main() { + double c = foo(); + cilk_spawn { + c += sin(c); + c += sin(c); + c += sin(c); + } + cilk_spawn { + cilk_spawn { + c += sin(c); + c += sin(c); + c += sin(c); + } + } + cilk_spawn { + if (c) { + c += sin(c); + c += sin(c); + c += sin(c); + } + } + return c; +} diff --git a/llvm/microbenchmarks/everything/everything.ll b/llvm/microbenchmarks/everything/everything.ll new file mode 100644 index 00000000000000..249549a7131cd5 --- /dev/null +++ b/llvm/microbenchmarks/everything/everything.ll @@ -0,0 +1,118 @@ +; ModuleID = 'everything.c' +source_filename = "everything.c" +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + +; Function Attrs: noinline nounwind ssp uwtable +define i32 @foo() #0 { +entry: + ret i32 10 +} + +; Function Attrs: noinline nounwind ssp uwtable +define i32 @main() #0 { +entry: + %retval = alloca i32, align 4 + %c = alloca double, align 8 + store i32 0, i32* %retval, align 4 + %call = call i32 @foo() + %conv = sitofp i32 %call to double + store double %conv, double* %c, align 8 + detach label %det.achd, label %det.cont + +det.achd: ; preds = %entry + %0 = bitcast i32 undef to i32 + %1 = load double, double* %c, align 8 + %call1 = call double @sin(double %1) #2 + %2 = load double, double* %c, align 8 + %add = 
fadd double %2, %call1 + store double %add, double* %c, align 8 + %3 = load double, double* %c, align 8 + %call2 = call double @sin(double %3) #2 + %4 = load double, double* %c, align 8 + %add3 = fadd double %4, %call2 + store double %add3, double* %c, align 8 + %5 = load double, double* %c, align 8 + %call4 = call double @sin(double %5) #2 + %6 = load double, double* %c, align 8 + %add5 = fadd double %6, %call4 + store double %add5, double* %c, align 8 + reattach label %det.cont + +det.cont: ; preds = %det.achd, %entry + detach label %det.achd6, label %det.cont15 + +det.achd6: ; preds = %det.cont + %7 = bitcast i32 undef to i32 + detach label %det.achd7, label %det.cont14 + +det.achd7: ; preds = %det.achd6 + %8 = bitcast i32 undef to i32 + %9 = load double, double* %c, align 8 + %call8 = call double @sin(double %9) #2 + %10 = load double, double* %c, align 8 + %add9 = fadd double %10, %call8 + store double %add9, double* %c, align 8 + %11 = load double, double* %c, align 8 + %call10 = call double @sin(double %11) #2 + %12 = load double, double* %c, align 8 + %add11 = fadd double %12, %call10 + store double %add11, double* %c, align 8 + %13 = load double, double* %c, align 8 + %call12 = call double @sin(double %13) #2 + %14 = load double, double* %c, align 8 + %add13 = fadd double %14, %call12 + store double %add13, double* %c, align 8 + reattach label %det.cont14 + +det.cont14: ; preds = %det.achd7, %det.achd6 + reattach label %det.cont15 + +det.cont15: ; preds = %det.cont14, %det.cont + detach label %det.achd16, label %det.cont23 + +det.achd16: ; preds = %det.cont15 + %15 = bitcast i32 undef to i32 + %16 = load double, double* %c, align 8 + %tobool = fcmp une double %16, 0.000000e+00 + br i1 %tobool, label %if.then, label %if.end + +if.then: ; preds = %det.achd16 + %17 = load double, double* %c, align 8 + %call17 = call double @sin(double %17) #2 + %18 = load double, double* %c, align 8 + %add18 = fadd double %18, %call17 + store double %add18, double* %c, align 
8 + %19 = load double, double* %c, align 8 + %call19 = call double @sin(double %19) #2 + %20 = load double, double* %c, align 8 + %add20 = fadd double %20, %call19 + store double %add20, double* %c, align 8 + %21 = load double, double* %c, align 8 + %call21 = call double @sin(double %21) #2 + %22 = load double, double* %c, align 8 + %add22 = fadd double %22, %call21 + store double %add22, double* %c, align 8 + br label %if.end + +if.end: ; preds = %if.then, %det.achd16 + reattach label %det.cont23 + +det.cont23: ; preds = %if.end, %det.cont15 + %23 = load double, double* %c, align 8 + %conv24 = fptosi double %23 to i32 + ret i32 %conv24 +} + +; Function Attrs: nounwind readnone +declare double @sin(double) #1 + +attributes #0 = { noinline nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind readnone } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"PIC Level", i32 2} +!1 = !{!"clang version 5.0.0 (git@github.com:wsmoses/Cilk-Clang 5942594810265567795884c83b5a37a8cbc98d3e) 
(git@github.com:wsmoses/Parallel-IR 8f57e0739bf9fc6736472c89f91a533630efd5c3)"} diff --git a/llvm/microbenchmarks/everything/simple.c b/llvm/microbenchmarks/everything/simple.c new file mode 100644 index 00000000000000..aa4252c4bc3890 --- /dev/null +++ b/llvm/microbenchmarks/everything/simple.c @@ -0,0 +1,15 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c = 0; + for (int i=0; i < 1000; i++) { + cilk_spawn { + foo(); + } + } + return c; +} diff --git a/llvm/microbenchmarks/everything/simple.ll b/llvm/microbenchmarks/everything/simple.ll new file mode 100644 index 00000000000000..268be428dbd3c6 --- /dev/null +++ b/llvm/microbenchmarks/everything/simple.ll @@ -0,0 +1,53 @@ +; ModuleID = 'simple.c' +source_filename = "simple.c" +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + +; Function Attrs: noinline nounwind ssp uwtable +define i32 @main() #0 { +entry: + %retval = alloca i32, align 4 + %c = alloca i32, align 4 + %i = alloca i32, align 4 + store i32 0, i32* %retval, align 4 + store i32 0, i32* %c, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 1000 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + detach label %det.achd, label %det.cont + +det.achd: ; preds = %for.body + %1 = bitcast i32 undef to i32 + %call = call i32 (...) @foo() + reattach label %det.cont + +det.cont: ; preds = %det.achd, %for.body + br label %for.inc + +for.inc: ; preds = %det.cont + %2 = load i32, i32* %i, align 4 + %inc = add nsw i32 %2, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %3 = load i32, i32* %c, align 4 + ret i32 %3 +} + +declare i32 @foo(...) 
#1 + +attributes #0 = { noinline nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"PIC Level", i32 2} +!1 = !{!"clang version 5.0.0 (git@github.com:wsmoses/Cilk-Clang 5942594810265567795884c83b5a37a8cbc98d3e) (git@github.com:wsmoses/Parallel-IR 8f57e0739bf9fc6736472c89f91a533630efd5c3)"} diff --git a/llvm/microbenchmarks/everything/temp.ll b/llvm/microbenchmarks/everything/temp.ll new file mode 100644 index 00000000000000..5d49d66271d392 --- /dev/null +++ b/llvm/microbenchmarks/everything/temp.ll @@ -0,0 +1,24 @@ +; ModuleID = '' +source_filename = "everything.c" +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + +; Function Attrs: noinline norecurse nounwind readnone ssp uwtable +define i32 @SpawnUnswitch_SmallBlock_RedundantSpawn_foo() local_unnamed_addr #0 { +entry: + ret i32 10 +} + +; Function Attrs: noinline norecurse nounwind readnone ssp uwtable +define i32 @SpawnUnswitch_SmallBlock_RedundantSpawn_main() 
local_unnamed_addr #0 { +entry: + ret i32 9 +} + +attributes #0 = { noinline norecurse nounwind readnone ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"PIC Level", i32 2} +!1 = !{!"clang version 5.0.0 (git@github.com:wsmoses/Cilk-Clang 5942594810265567795884c83b5a37a8cbc98d3e) (git@github.com:wsmoses/Parallel-IR 8f57e0739bf9fc6736472c89f91a533630efd5c3)"} diff --git a/llvm/microbenchmarks/redundantspawn/complex.c b/llvm/microbenchmarks/redundantspawn/complex.c new file mode 100644 index 00000000000000..23874168629bd1 --- /dev/null +++ b/llvm/microbenchmarks/redundantspawn/complex.c @@ -0,0 +1,32 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c; + cilk_spawn { + cilk_spawn { + foo(); + bar(); + c = 2; + } + bar(); + cilk_spawn { + cilk_spawn { + cilk_spawn { + foo(); + } + } + bar(); + } + cilk_spawn { + cilk_spawn { + foo(); + foo(); + } + } + } + return c; +} diff --git a/llvm/microbenchmarks/redundantspawn/multiple_nested.c b/llvm/microbenchmarks/redundantspawn/multiple_nested.c new file mode 100644 index 00000000000000..3f9a1f235b183a --- /dev/null +++ b/llvm/microbenchmarks/redundantspawn/multiple_nested.c @@ -0,0 +1,21 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c; + cilk_spawn { + cilk_spawn { + foo(); + bar(); + c = 2; + } + cilk_spawn { + foo(); + foo(); + } + } + return c; +} diff --git a/llvm/microbenchmarks/redundantspawn/multiple_redundant.c b/llvm/microbenchmarks/redundantspawn/multiple_redundant.c new 
file mode 100644 index 00000000000000..aa52f045e0be6f --- /dev/null +++ b/llvm/microbenchmarks/redundantspawn/multiple_redundant.c @@ -0,0 +1,20 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c; + cilk_spawn { + cilk_spawn { + cilk_spawn { + cilk_spawn { + foo(); + foo(); + } + } + } + } + return c; +} diff --git a/llvm/microbenchmarks/redundantspawn/serial.c b/llvm/microbenchmarks/redundantspawn/serial.c new file mode 100644 index 00000000000000..12b21b6b0ebc38 --- /dev/null +++ b/llvm/microbenchmarks/redundantspawn/serial.c @@ -0,0 +1,15 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c = foo(); + if (c > 0) { + bar(); + } else { + foo(); + } + return c; +} diff --git a/llvm/microbenchmarks/redundantspawn/simple_spawn.c b/llvm/microbenchmarks/redundantspawn/simple_spawn.c new file mode 100644 index 00000000000000..41183d94ae8ad0 --- /dev/null +++ b/llvm/microbenchmarks/redundantspawn/simple_spawn.c @@ -0,0 +1,15 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c; + cilk_spawn { + foo(); + bar(); + c = 2; + } + return c; +} diff --git a/llvm/microbenchmarks/redundantspawn/single_redundant.c b/llvm/microbenchmarks/redundantspawn/single_redundant.c new file mode 100644 index 00000000000000..33de19ce0f1872 --- /dev/null +++ b/llvm/microbenchmarks/redundantspawn/single_redundant.c @@ -0,0 +1,16 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c; + cilk_spawn { + cilk_spawn { + foo(); + foo(); + } + } + return c; +} diff --git a/llvm/microbenchmarks/smallblock/conditional.c b/llvm/microbenchmarks/smallblock/conditional.c new file mode 100644 index 00000000000000..058b70da06735f --- /dev/null +++ b/llvm/microbenchmarks/smallblock/conditional.c @@ -0,0 +1,27 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c = foo(); + if (c*2 > 1) { + cilk_spawn { + if (c > 1) { + bar(); + } else { + foo(); + } + } + } else if (c*3 < 1) { + cilk_spawn { + bar(); + } + } else { + cilk_spawn { + foo(); + } + } + 
return c; +} diff --git a/llvm/microbenchmarks/smallblock/conditional.ll b/llvm/microbenchmarks/smallblock/conditional.ll new file mode 100644 index 00000000000000..6e796bb19273e1 --- /dev/null +++ b/llvm/microbenchmarks/smallblock/conditional.ll @@ -0,0 +1,66 @@ +; ModuleID = 'conditional.c' +source_filename = "conditional.c" +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + +; Function Attrs: nounwind ssp uwtable +define i32 @main() local_unnamed_addr #0 { +entry: + %call = tail call i32 (...) @foo() #2 + %mul = shl nsw i32 %call, 1 + %cmp = icmp sgt i32 %mul, 1 + br i1 %cmp, label %if.then, label %if.else5 + +if.then: ; preds = %entry + detach label %det.achd, label %if.end17 + +det.achd: ; preds = %if.then + %cmp1 = icmp sgt i32 %call, 1 + br i1 %cmp1, label %if.then2, label %if.else + +if.then2: ; preds = %det.achd + %call3 = tail call i32 (...) @bar() #2 + br label %if.end + +if.else: ; preds = %det.achd + %call4 = tail call i32 (...) @foo() #2 + br label %if.end + +if.end: ; preds = %if.else, %if.then2 + reattach label %if.end17 + +if.else5: ; preds = %entry + %cmp7 = icmp slt i32 %call, 1 + br i1 %cmp7, label %if.then8, label %if.else12 + +if.then8: ; preds = %if.else5 + detach label %det.achd9, label %if.end17 + +det.achd9: ; preds = %if.then8 + %call10 = tail call i32 (...) @bar() #2 + reattach label %if.end17 + +if.else12: ; preds = %if.else5 + detach label %det.achd13, label %if.end17 + +det.achd13: ; preds = %if.else12 + %call14 = tail call i32 (...) @foo() #2 + reattach label %if.end17 + +if.end17: ; preds = %det.achd9, %if.then8, %det.achd13, %if.else12, %if.then, %if.end + ret i32 %call +} + +declare i32 @foo(...) local_unnamed_addr #1 + +declare i32 @bar(...) 
local_unnamed_addr #1 + +attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"PIC Level", i32 2} +!1 = !{!"clang version 4.0.0 (git@github.com:wsmoses/Cilk-Clang cc78c4b6082bb80687e64c8104bf9744e6fa8fdc) (git@github.com:wsmoses/Parallel-IR 52889bc31182f3faebcfce24918670967b5b96f6)"} diff --git a/llvm/microbenchmarks/smallblock/conditional_opt.ll b/llvm/microbenchmarks/smallblock/conditional_opt.ll new file mode 100644 index 00000000000000..226b5972c852b4 --- /dev/null +++ b/llvm/microbenchmarks/smallblock/conditional_opt.ll @@ -0,0 +1,89 @@ +; ModuleID = '' +source_filename = "conditional.c" +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + +; Function Attrs: noinline nounwind ssp uwtable +define i32 @SmallBlock_main() #0 { +entry: + %retval = alloca i32, align 4 + %c = alloca i32, align 4 + store i32 0, i32* %retval, align 4 + %call = call i32 (...) 
@foo() + store i32 %call, i32* %c, align 4 + %0 = load i32, i32* %c, align 4 + %mul = mul nsw i32 %0, 2 + %cmp = icmp sgt i32 %mul, 1 + br i1 %cmp, label %if.then, label %if.else5 + +if.then: ; preds = %entry + br label %det.achd + +det.achd: ; preds = %if.then + %1 = bitcast i32 undef to i32 + %2 = load i32, i32* %c, align 4 + %cmp1 = icmp sgt i32 %2, 1 + br i1 %cmp1, label %if.then2, label %if.else + +if.then2: ; preds = %det.achd + %call3 = call i32 (...) @bar() + br label %if.end + +if.else: ; preds = %det.achd + %call4 = call i32 (...) @foo() + br label %if.end + +if.end: ; preds = %if.else, %if.then2 + br label %det.cont + +det.cont: ; preds = %if.end + br label %if.end17 + +if.else5: ; preds = %entry + %3 = load i32, i32* %c, align 4 + %mul6 = mul nsw i32 %3, 3 + %cmp7 = icmp slt i32 %mul6, 1 + br i1 %cmp7, label %if.then8, label %if.else12 + +if.then8: ; preds = %if.else5 + br label %det.achd9 + +det.achd9: ; preds = %if.then8 + %4 = bitcast i32 undef to i32 + %call10 = call i32 (...) @bar() + br label %det.cont11 + +det.cont11: ; preds = %det.achd9 + br label %if.end16 + +if.else12: ; preds = %if.else5 + br label %det.achd13 + +det.achd13: ; preds = %if.else12 + %5 = bitcast i32 undef to i32 + %call14 = call i32 (...) @foo() + br label %det.cont15 + +det.cont15: ; preds = %det.achd13 + br label %if.end16 + +if.end16: ; preds = %det.cont15, %det.cont11 + br label %if.end17 + +if.end17: ; preds = %if.end16, %det.cont + %6 = load i32, i32* %c, align 4 + ret i32 %6 +} + +declare i32 @foo(...) #1 + +declare i32 @bar(...) 
#1 + +attributes #0 = { noinline nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"PIC Level", i32 2} +!1 = !{!"clang version 4.0.0 (git@github.com:wsmoses/Cilk-Clang cc78c4b6082bb80687e64c8104bf9744e6fa8fdc) (git@github.com:wsmoses/Parallel-IR 52889bc31182f3faebcfce24918670967b5b96f6)"} diff --git a/llvm/microbenchmarks/smallblock/multiple_nested.c b/llvm/microbenchmarks/smallblock/multiple_nested.c new file mode 100644 index 00000000000000..3f9a1f235b183a --- /dev/null +++ b/llvm/microbenchmarks/smallblock/multiple_nested.c @@ -0,0 +1,21 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c; + cilk_spawn { + cilk_spawn { + foo(); + bar(); + c = 2; + } + cilk_spawn { + foo(); + foo(); + } + } + return c; +} diff --git a/llvm/microbenchmarks/smallblock/multiple_spawn.c b/llvm/microbenchmarks/smallblock/multiple_spawn.c new file mode 100644 index 00000000000000..b551796f050ed0 --- /dev/null +++ b/llvm/microbenchmarks/smallblock/multiple_spawn.c @@ -0,0 +1,19 
@@ +#include + +int foo(); + +int bar(); + +int main() { + int c; + cilk_spawn { + foo(); + bar(); + c = 2; + } + cilk_spawn { + foo(); + foo(); + } + return c; +} diff --git a/llvm/microbenchmarks/smallblock/serial.c b/llvm/microbenchmarks/smallblock/serial.c new file mode 100644 index 00000000000000..12b21b6b0ebc38 --- /dev/null +++ b/llvm/microbenchmarks/smallblock/serial.c @@ -0,0 +1,15 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c = foo(); + if (c > 0) { + bar(); + } else { + foo(); + } + return c; +} diff --git a/llvm/microbenchmarks/smallblock/simple_spawn.c b/llvm/microbenchmarks/smallblock/simple_spawn.c new file mode 100644 index 00000000000000..41183d94ae8ad0 --- /dev/null +++ b/llvm/microbenchmarks/smallblock/simple_spawn.c @@ -0,0 +1,15 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c; + cilk_spawn { + foo(); + bar(); + c = 2; + } + return c; +} diff --git a/llvm/microbenchmarks/spawnrestructure/base_negative.c b/llvm/microbenchmarks/spawnrestructure/base_negative.c new file mode 100644 index 00000000000000..3718ca3466844c --- /dev/null +++ b/llvm/microbenchmarks/spawnrestructure/base_negative.c @@ -0,0 +1,20 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c; + cilk_spawn { + foo(); + bar(); + c = 2; + } + cilk_spawn { + foo(); + foo(); + } + bar(); + return 0; +} diff --git a/llvm/microbenchmarks/spawnrestructure/base_negative.ll b/llvm/microbenchmarks/spawnrestructure/base_negative.ll new file mode 100644 index 00000000000000..b92b96b872d09c --- /dev/null +++ b/llvm/microbenchmarks/spawnrestructure/base_negative.ll @@ -0,0 +1,46 @@ +; ModuleID = 'base_negative.c' +source_filename = "base_negative.c" +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + +; Function Attrs: noinline nounwind ssp uwtable +define i32 @main() #0 { +entry: + %retval = alloca i32, align 4 + %c = alloca i32, align 4 + store i32 0, i32* %retval, align 4 + detach label 
%det.achd, label %det.cont + +det.achd: ; preds = %entry + %0 = bitcast i32 undef to i32 + %call = call i32 (...) @foo() + %call1 = call i32 (...) @bar() + store i32 2, i32* %c, align 4 + reattach label %det.cont + +det.cont: ; preds = %det.achd, %entry + detach label %det.achd2, label %det.cont5 + +det.achd2: ; preds = %det.cont + %1 = bitcast i32 undef to i32 + %call3 = call i32 (...) @foo() + %call4 = call i32 (...) @foo() + reattach label %det.cont5 + +det.cont5: ; preds = %det.achd2, %det.cont + %call6 = call i32 (...) @bar() + ret i32 0 +} + +declare i32 @foo(...) #1 + +declare i32 @bar(...) #1 + +attributes #0 = { noinline nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"PIC Level", i32 2} +!1 = !{!"clang version 4.0.0 (git@github.com:wsmoses/Cilk-Clang cc78c4b6082bb80687e64c8104bf9744e6fa8fdc) (git@github.com:wsmoses/Parallel-IR 52889bc31182f3faebcfce24918670967b5b96f6)"} diff --git a/llvm/microbenchmarks/spawnrestructure/base_positive.c 
b/llvm/microbenchmarks/spawnrestructure/base_positive.c new file mode 100644 index 00000000000000..7e3d0546efd52b --- /dev/null +++ b/llvm/microbenchmarks/spawnrestructure/base_positive.c @@ -0,0 +1,19 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c; + cilk_spawn { + foo(); + bar(); + c = 2; + } + cilk_spawn { + foo(); + foo(); + } + return bar(); +} diff --git a/llvm/microbenchmarks/spawnrestructure/base_positive.ll b/llvm/microbenchmarks/spawnrestructure/base_positive.ll new file mode 100644 index 00000000000000..8055cdfe786d67 --- /dev/null +++ b/llvm/microbenchmarks/spawnrestructure/base_positive.ll @@ -0,0 +1,46 @@ +; ModuleID = 'base_positive.c' +source_filename = "base_positive.c" +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + +; Function Attrs: noinline nounwind ssp uwtable +define i32 @main() #0 { +entry: + %retval = alloca i32, align 4 + %c = alloca i32, align 4 + store i32 0, i32* %retval, align 4 + detach label %det.achd, label %det.cont + +det.achd: ; preds = %entry + %0 = bitcast i32 undef to i32 + %call = call i32 (...) @foo() + %call1 = call i32 (...) @bar() + store i32 2, i32* %c, align 4 + reattach label %det.cont + +det.cont: ; preds = %det.achd, %entry + detach label %det.achd2, label %det.cont5 + +det.achd2: ; preds = %det.cont + %1 = bitcast i32 undef to i32 + %call3 = call i32 (...) @foo() + %call4 = call i32 (...) @foo() + reattach label %det.cont5 + +det.cont5: ; preds = %det.achd2, %det.cont + %call6 = call i32 (...) @bar() + ret i32 %call6 +} + +declare i32 @foo(...) #1 + +declare i32 @bar(...) 
#1 + +attributes #0 = { noinline nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"PIC Level", i32 2} +!1 = !{!"clang version 4.0.0 (git@github.com:wsmoses/Cilk-Clang cc78c4b6082bb80687e64c8104bf9744e6fa8fdc) (git@github.com:wsmoses/Parallel-IR 52889bc31182f3faebcfce24918670967b5b96f6)"} diff --git a/llvm/microbenchmarks/spawnrestructure/complex.c b/llvm/microbenchmarks/spawnrestructure/complex.c new file mode 100644 index 00000000000000..23874168629bd1 --- /dev/null +++ b/llvm/microbenchmarks/spawnrestructure/complex.c @@ -0,0 +1,32 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c; + cilk_spawn { + cilk_spawn { + foo(); + bar(); + c = 2; + } + bar(); + cilk_spawn { + cilk_spawn { + cilk_spawn { + foo(); + } + } + bar(); + } + cilk_spawn { + cilk_spawn { + foo(); + foo(); + } + } + } + return c; +} diff --git a/llvm/microbenchmarks/spawnrestructure/multiple_nested.c b/llvm/microbenchmarks/spawnrestructure/multiple_nested.c new file mode 100644 index 
00000000000000..3f9a1f235b183a --- /dev/null +++ b/llvm/microbenchmarks/spawnrestructure/multiple_nested.c @@ -0,0 +1,21 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c; + cilk_spawn { + cilk_spawn { + foo(); + bar(); + c = 2; + } + cilk_spawn { + foo(); + foo(); + } + } + return c; +} diff --git a/llvm/microbenchmarks/spawnrestructure/serial.c b/llvm/microbenchmarks/spawnrestructure/serial.c new file mode 100644 index 00000000000000..12b21b6b0ebc38 --- /dev/null +++ b/llvm/microbenchmarks/spawnrestructure/serial.c @@ -0,0 +1,15 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c = foo(); + if (c > 0) { + bar(); + } else { + foo(); + } + return c; +} diff --git a/llvm/microbenchmarks/spawnrestructure/simple_spawn.c b/llvm/microbenchmarks/spawnrestructure/simple_spawn.c new file mode 100644 index 00000000000000..41183d94ae8ad0 --- /dev/null +++ b/llvm/microbenchmarks/spawnrestructure/simple_spawn.c @@ -0,0 +1,15 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c; + cilk_spawn { + foo(); + bar(); + c = 2; + } + return c; +} diff --git a/llvm/microbenchmarks/spawnunswitch/simple.c b/llvm/microbenchmarks/spawnunswitch/simple.c new file mode 100644 index 00000000000000..d817a44c676419 --- /dev/null +++ b/llvm/microbenchmarks/spawnunswitch/simple.c @@ -0,0 +1,16 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c = foo(); + int d = bar(); + cilk_spawn { + if (c) { + foo(); + } + } + return foo(); +} diff --git a/llvm/microbenchmarks/spawnunswitch/simple.ll b/llvm/microbenchmarks/spawnunswitch/simple.ll new file mode 100644 index 00000000000000..05d3ac9fbbd8ec --- /dev/null +++ b/llvm/microbenchmarks/spawnunswitch/simple.ll @@ -0,0 +1,41 @@ +; ModuleID = 'simple.c' +source_filename = "simple.c" +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + +; Function Attrs: nounwind ssp uwtable +define i32 @main() local_unnamed_addr #0 { +entry: + %call = tail call i32 
(...) @foo() #2 + %call1 = tail call i32 (...) @bar() #2 + detach label %det.achd, label %det.cont + +det.achd: ; preds = %entry + %tobool = icmp eq i32 %call, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %det.achd + %call2 = tail call i32 (...) @foo() #2 + br label %if.end + +if.end: ; preds = %det.achd, %if.then + reattach label %det.cont + +det.cont: ; preds = %if.end, %entry + %call3 = tail call i32 (...) @foo() #2 + ret i32 %call3 +} + +declare i32 @foo(...) local_unnamed_addr #1 + +declare i32 @bar(...) local_unnamed_addr #1 + +attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"PIC Level", i32 2} +!1 = !{!"clang version 5.0.0 (git@github.com:wsmoses/Cilk-Clang 5942594810265567795884c83b5a37a8cbc98d3e) (git@github.com:wsmoses/Parallel-IR 8f57e0739bf9fc6736472c89f91a533630efd5c3)"} diff --git a/llvm/microbenchmarks/spawnunswitch/simple2.c b/llvm/microbenchmarks/spawnunswitch/simple2.c new file 
mode 100644 index 00000000000000..7e376f1522451d --- /dev/null +++ b/llvm/microbenchmarks/spawnunswitch/simple2.c @@ -0,0 +1,14 @@ +#include + +int foo(); + +int bar(); + +int main() { + if (foo()) { + cilk_spawn { + bar(); + } + } + return foo(); +} diff --git a/llvm/microbenchmarks/spawnunswitch/simple2.ll b/llvm/microbenchmarks/spawnunswitch/simple2.ll new file mode 100644 index 00000000000000..a6dfc993f89703 --- /dev/null +++ b/llvm/microbenchmarks/spawnunswitch/simple2.ll @@ -0,0 +1,37 @@ +; ModuleID = 'simple2.c' +source_filename = "simple2.c" +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + +; Function Attrs: nounwind ssp uwtable +define i32 @main() local_unnamed_addr #0 { +entry: + %call = tail call i32 (...) @foo() #2 + %tobool = icmp eq i32 %call, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry + detach label %det.achd, label %if.end + +det.achd: ; preds = %if.then + %call1 = tail call i32 (...) @bar() #2 + reattach label %if.end + +if.end: ; preds = %entry, %if.then, %det.achd + %call2 = tail call i32 (...) @foo() #2 + ret i32 %call2 +} + +declare i32 @foo(...) local_unnamed_addr #1 + +declare i32 @bar(...) 
local_unnamed_addr #1 + +attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"PIC Level", i32 2} +!1 = !{!"clang version 5.0.0 (git@github.com:wsmoses/Cilk-Clang 5942594810265567795884c83b5a37a8cbc98d3e) (git@github.com:wsmoses/Parallel-IR 8f57e0739bf9fc6736472c89f91a533630efd5c3)"} diff --git a/llvm/microbenchmarks/spawnunswitch/temp.ll b/llvm/microbenchmarks/spawnunswitch/temp.ll new file mode 100644 index 00000000000000..1484049381dfc4 --- /dev/null +++ b/llvm/microbenchmarks/spawnunswitch/temp.ll @@ -0,0 +1,38 @@ +; ModuleID = '' +source_filename = "simple.c" +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + +; Function Attrs: nounwind ssp uwtable +define i32 @SpawnUnswitch_main() local_unnamed_addr #0 { +entry: + %call = tail call i32 (...) @foo() #2 + %call1 = tail call i32 (...) 
@bar() #2 + %tobool = icmp eq i32 %call, 0 + br i1 %tobool, label %det.achd, label %det.cont + +det.achd: ; preds = %entry + detach label %if.end, label %det.cont + +if.end: ; preds = %det.achd + %call2 = tail call i32 (...) @foo() #2 + reattach label %det.cont + +det.cont: ; preds = %det.achd, %entry, %if.end + %call3 = tail call i32 (...) @foo() #2 + ret i32 %call3 +} + +declare i32 @foo(...) local_unnamed_addr #1 + +declare i32 @bar(...) local_unnamed_addr #1 + +attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"PIC Level", i32 2} +!1 = !{!"clang version 5.0.0 (git@github.com:wsmoses/Cilk-Clang 5942594810265567795884c83b5a37a8cbc98d3e) (git@github.com:wsmoses/Parallel-IR 8f57e0739bf9fc6736472c89f91a533630efd5c3)"} diff --git a/llvm/microbenchmarks/spawnunswitch/test.c b/llvm/microbenchmarks/spawnunswitch/test.c new file mode 100644 index 00000000000000..7228775811b839 --- /dev/null +++ 
b/llvm/microbenchmarks/spawnunswitch/test.c @@ -0,0 +1,12 @@ +#include <cilk/cilk.h> + +int foo(); + +int bar(); + +int main() { + cilk_for (int i=0; i < 1000; i++) { + foo(); + } + return foo(); +} diff --git a/llvm/microbenchmarks/spawnunswitch/test2.c b/llvm/microbenchmarks/spawnunswitch/test2.c new file mode 100644 index 00000000000000..56dd3cb7977f61 --- /dev/null +++ b/llvm/microbenchmarks/spawnunswitch/test2.c @@ -0,0 +1,12 @@ +#include <cilk/cilk.h> + +int foo(); + +int bar(); + +int main() { + cilk_spawn { + bar(); + } + return foo(); +} diff --git a/llvm/microbenchmarks/timing/average.py b/llvm/microbenchmarks/timing/average.py new file mode 100644 index 00000000000000..17dc85395caa7a --- /dev/null +++ b/llvm/microbenchmarks/timing/average.py @@ -0,0 +1,10 @@ +import sys +f = open("spawn.txt", 'r') +g = open("simple.txt", 'r') +total1 = 0 +for line in f.readlines(): + total1 += int(line[:len(line)-1]) +total2 = 0 +for line in g.readlines(): + total2 += int(line[:len(line)-1]) +print "Spawn to serial ratio: " + str((total1*1.0)/total2) diff --git a/llvm/microbenchmarks/timing/ratio.sh b/llvm/microbenchmarks/timing/ratio.sh new file mode 100644 index 00000000000000..ac4c6a3e239305 --- /dev/null +++ b/llvm/microbenchmarks/timing/ratio.sh @@ -0,0 +1 @@ +for i in {1..100};do ./simple >> simple.txt;./spawn >> spawn.txt;done;python average.py;rm *.txt diff --git a/llvm/microbenchmarks/timing/simple b/llvm/microbenchmarks/timing/simple new file mode 100755 index 0000000000000000000000000000000000000000..68c3cd94e6a26ff46dbaa46b1e29eb06846e72f4 GIT binary patch literal 8480 (binary data omitted)
diff --git a/llvm/microbenchmarks/timing/simple.c b/llvm/microbenchmarks/timing/simple.c new file mode 100644 index 00000000000000..c7a90879912060 --- /dev/null +++ b/llvm/microbenchmarks/timing/simple.c @@ -0,0 +1,16 @@ +#include <stdio.h> +#include <time.h> + +int main() { + int c = 0; + int its = 100; + clock_t start = clock(), diff; + #pragma unroll + for (int i = 0; i < its; i++) { + c += i; + } + diff = clock() - start; + int msec = (diff * 1000000) / CLOCKS_PER_SEC; + printf("%d\n", msec); + return c; +} diff --git a/llvm/microbenchmarks/timing/spawn b/llvm/microbenchmarks/timing/spawn new file mode 100755 index 0000000000000000000000000000000000000000..3dc36bce56b1ea1337b8ccdd6eef06535e41886b GIT binary patch literal 8480 (binary data omitted) diff --git a/llvm/microbenchmarks/timing/spawn.c b/llvm/microbenchmarks/timing/spawn.c new file mode 100644 index
00000000000000..1588cfec2f113d --- /dev/null +++ b/llvm/microbenchmarks/timing/spawn.c @@ -0,0 +1,19 @@ +#include <cilk/cilk.h> +#include <stdio.h> +#include <time.h> + +int main() { + int c = 0; + int its = 100; + clock_t start = clock(), diff; + cilk_spawn { + for (int i = 0; i < its; i++) { + c += i; + } + } + cilk_sync; + diff = clock() - start; + int msec = (diff * 1000000) / CLOCKS_PER_SEC; + printf("%d\n", msec); + return c; +} diff --git a/llvm/test/Transforms/LoopFuse/fuse.ll b/llvm/test/Transforms/LoopFuse/fuse.ll new file mode 100644 index 00000000000000..f283778f432028 --- /dev/null +++ b/llvm/test/Transforms/LoopFuse/fuse.ll @@ -0,0 +1,87 @@ +; RUN: opt -loop-fuse -verify-loop-info -verify-dom-info %s -S -o - | FileCheck %s + +; 'C' equivalent: Partially generated and hand modified. +; void fuse(int *a, int *b, int *c) { +; for (i = 0; i < 1000; ++i) // L1 +; c[i] = a[i] + c[i + 1]; +; for (i = 0; i < 1000; ++i) // L2 +; c[i] = a[i] + b[i]; +; } +; There is no backward dependence from L1 to L2. So it is safe to fuse. + +; Test that there are two versions - original loops and fused loop. +; CHECK: br i1 %memcheck.conflict, label %entry.split, label %entry.split.L1clone + +; Test for fusion along fused path. +; CHECK: for.body.L1clone: ; preds = %for.body.1.L2clone, %entry.split.L1clone +; CHECK: for.body.1.L2clone: ; preds = %for.body.L1clone +; CHECK: br i1 %exitcond.L1clone, label %for.end.loopexit.1, label %for.body.L1clone, !llvm.loop !1 + +; Test for merged defs and its uses outside the loops.
+; CHECK: for.end.loopexit.1: ; preds = %for.body.1.L2clone, %for.body.1 +; CHECK: %add11.lfuse = phi i32 [ %add11, %for.body.1 ], [ %add11.L2clone, %for.body.1.L2clone ] +; CHECK: %add4.lfuse = phi i32 [ %add4, %for.body.1 ], [ %add4.L1clone, %for.body.1.L2clone ] +; CHECK: %outsideUse = add nsw i32 %add11.lfuse, %add4.lfuse + +; ModuleID = '1.bc' + +; Function Attrs: norecurse nounwind uwtable +define void @bigLoop(i32* nocapture readonly %a, i32* nocapture readonly %b, i32* nocapture %c) #0 { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %arrayidx3 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv.next + %1 = load i32, i32* %arrayidx3, align 4 + %add4 = add nsw i32 %1, %0 + %arrayidx6 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv + store i32 %add4, i32* %arrayidx6, align 4 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.end.loopexit, label %for.body, !llvm.loop !4 + +for.end.loopexit: ; preds = %for.body + br label %for.body.1 + +for.body.1: ; preds = %for.body.1, %for.end.loopexit + %indvars.iv.1 = phi i64 [ 0, %for.end.loopexit ], [ %indvars.iv.next.1, %for.body.1 ] + %arrayidx.1 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv.1 + %2 = load i32, i32* %arrayidx.1, align 4 + %arrayidx10 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.1 + %3 = load i32, i32* %arrayidx10, align 4 + %add11 = add nsw i32 %3, %2 + %arrayidx12 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv.1 + store i32 %add11, i32* %arrayidx12, align 4 + %indvars.iv.next.1 = add i64 %indvars.iv.1, 1 + %exitcond.1 = icmp eq i64 %indvars.iv.next.1, 1000 + br i1 %exitcond.1, label %for.end.loopexit.1, label %for.body.1, !llvm.loop !4 + +for.end.loopexit.1: ; preds = %for.body.1 + br label 
%for.end + +for.end: ; preds = %for.end.loopexit.1 + %outsideUse = add nsw i32 %add11, %add4 + ret void +} + +attributes #0 = { norecurse nounwind uwtable } +attributes #1 = { norecurse nounwind readonly uwtable } +attributes #2 = { nounwind uwtable } +attributes #3 = { nounwind readonly } +attributes #4 = { nounwind } +attributes #5 = { noreturn nounwind } +attributes #6 = { nounwind readonly } + +!llvm.ident = !{!0} + +!0 = !{!"clang version 3.8.0"} +!1 = distinct !{!1, !2, !3} +!2 = !{!"llvm.loop.vectorize.width", i32 1} +!3 = !{!"llvm.loop.interleave.count", i32 1} +!4 = distinct !{!4, !5} +!5 = !{!"llvm.loop.unroll.disable"} +!6 = distinct !{!6, !2, !3} +!7 = distinct !{!7, !2, !3} diff --git a/llvm/test/Transforms/LoopFuse/no-fuse.ll b/llvm/test/Transforms/LoopFuse/no-fuse.ll new file mode 100644 index 00000000000000..7abb67fd622998 --- /dev/null +++ b/llvm/test/Transforms/LoopFuse/no-fuse.ll @@ -0,0 +1,78 @@ +; RUN: opt -loop-fuse -verify-loop-info -verify-dom-info %s -S -o - | FileCheck %s + +; 'C' equivalent: Partially generated and hand modified. +; void noFuse(int *a, int *b, int *c) { +; for (i = 0; i < 1000; ++i) // L1 +; c[i] = a[i] + c[i - 1]; +; for (i = 0; i < 1000; ++i) // L2 +; c[i] = a[i] + b[i]; +; } +; There is a backward dependence from L1 to L2. So it is unsafe to fuse. 
+ +; CHECK: entry: +; CHECK-NEXT: br label %for.body +; CHECK: for.body: ; preds = %for.body, %entry +; CHECK: for.body.1: + +; ModuleID = '1.bc' + +; Function Attrs: norecurse nounwind uwtable +define void @bigLoop(i32* nocapture readonly %a, i32* nocapture readonly %b, i32* nocapture %c) #0 { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %indvars.iv.next.back = add i64 %indvars.iv, -1 + %arrayidx3 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv.next.back + %1 = load i32, i32* %arrayidx3, align 4 + %add4 = add nsw i32 %1, %0 + %arrayidx6 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv + store i32 %add4, i32* %arrayidx6, align 4 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.end.loopexit, label %for.body, !llvm.loop !4 + +for.end.loopexit: ; preds = %for.body + br label %for.body.1 + +for.body.1: ; preds = %for.body.1, %for.end.loopexit + %indvars.iv.1 = phi i64 [ 0, %for.end.loopexit ], [ %indvars.iv.next.1, %for.body.1 ] + %arrayidx.1 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv.1 + %2 = load i32, i32* %arrayidx.1, align 4 + %arrayidx10 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.1 + %3 = load i32, i32* %arrayidx10, align 4 + %add11 = add nsw i32 %3, %2 + %arrayidx12 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv.1 + store i32 %add11, i32* %arrayidx12, align 4 + %indvars.iv.next.1 = add i64 %indvars.iv.1, 1 + %exitcond.1 = icmp eq i64 %indvars.iv.next.1, 1000 + br i1 %exitcond.1, label %for.end.loopexit.1, label %for.body.1, !llvm.loop !4 + +for.end.loopexit.1: ; preds = %for.body.1 + br label %for.end + +for.end: ; preds = %for.end.loopexit.1 + ret void +} + +attributes #0 = { norecurse nounwind uwtable } +attributes #1 = { norecurse nounwind 
readonly uwtable } +attributes #2 = { nounwind uwtable } +attributes #3 = { nounwind readonly } +attributes #4 = { nounwind } +attributes #5 = { noreturn nounwind } +attributes #6 = { nounwind readonly } + +!llvm.ident = !{!0} + +!0 = !{!"clang version 3.8.0"} +!1 = distinct !{!1, !2, !3} +!2 = !{!"llvm.loop.vectorize.width", i32 1} +!3 = !{!"llvm.loop.interleave.count", i32 1} +!4 = distinct !{!4, !5} +!5 = !{!"llvm.loop.unroll.disable"} +!6 = distinct !{!6, !2, !3} +!7 = distinct !{!7, !2, !3} diff --git a/llvm/test/Transforms/Tapir/SyncElimination/basic1.cpp b/llvm/test/Transforms/Tapir/SyncElimination/basic1.cpp new file mode 100644 index 00000000000000..0461b69c99b3a1 --- /dev/null +++ b/llvm/test/Transforms/Tapir/SyncElimination/basic1.cpp @@ -0,0 +1,6 @@ +#include <cilk/cilk.h> + +void func() { + cilk_sync; + cilk_sync; +} diff --git a/llvm/test/Transforms/Tapir/SyncElimination/basic1.ll b/llvm/test/Transforms/Tapir/SyncElimination/basic1.ll new file mode 100644 index 00000000000000..5615d4c1310d2c --- /dev/null +++ b/llvm/test/Transforms/Tapir/SyncElimination/basic1.ll @@ -0,0 +1,29 @@ +; RUN: opt < %s -sync-elimination -S | FileCheck %s + +; ModuleID = 'basic1.cpp' +source_filename = "basic1.cpp" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: noinline nounwind uwtable +define void @_Z4funcv() #0 { +entry: +; CHECK: @_Z4funcv + %syncreg = call token @llvm.syncregion.start() +; CHECK-NOT: sync within %syncreg, label %sync.continue + sync within %syncreg, label %sync.continue + +sync.continue: ; preds = %entry +; CHECK-NOT: sync within %syncreg, label %sync.continue + sync within %syncreg, label %sync.continue1 + +; CHECK: sync.continue +sync.continue1: ; preds = %sync.continue + ret void +} + +; Function Attrs: argmemonly nounwind +declare token @llvm.syncregion.start() #1 + +attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false"
"less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } diff --git a/llvm/test/Transforms/Tapir/SyncElimination/basic2.cpp b/llvm/test/Transforms/Tapir/SyncElimination/basic2.cpp new file mode 100644 index 00000000000000..6de0ad05f14611 --- /dev/null +++ b/llvm/test/Transforms/Tapir/SyncElimination/basic2.cpp @@ -0,0 +1,8 @@ +#include <cilk/cilk.h> + +void func() { + cilk_spawn { + } + cilk_sync; + cilk_sync; +} diff --git a/llvm/test/Transforms/Tapir/SyncElimination/basic2.ll b/llvm/test/Transforms/Tapir/SyncElimination/basic2.ll new file mode 100644 index 00000000000000..5658771430bc25 --- /dev/null +++ b/llvm/test/Transforms/Tapir/SyncElimination/basic2.ll @@ -0,0 +1,34 @@ +; RUN: opt < %s -sync-elimination -S | FileCheck %s + +; ModuleID = 'basic2.cpp' +source_filename = "basic2.cpp" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: noinline nounwind uwtable +define void @_Z4funcv() #0 { +; CHECK: @_Z4funcv +entry: + %syncreg = call token @llvm.syncregion.start() + detach within %syncreg, label %det.achd, label %det.cont + +det.achd: ; preds = %entry + reattach within %syncreg, label %det.cont + +det.cont: ; preds = %det.achd, %entry +; CHECK-NOT: sync within %syncreg, label %sync.continue + sync within %syncreg, label %sync.continue + +; CHECK: sync.continue +sync.continue: ; preds = %det.cont + sync within %syncreg, label %sync.continue1 + +sync.continue1: ; preds = %sync.continue + ret void +} + +; Function Attrs: argmemonly nounwind +declare token @llvm.syncregion.start() #1 + +attributes #0 = { noinline nounwind uwtable
"correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } diff --git a/llvm/test/Transforms/Tapir/SyncElimination/fail1.cpp b/llvm/test/Transforms/Tapir/SyncElimination/fail1.cpp new file mode 100644 index 00000000000000..03c7cb7efdd77d --- /dev/null +++ b/llvm/test/Transforms/Tapir/SyncElimination/fail1.cpp @@ -0,0 +1,9 @@ +#include <cilk/cilk.h> + +void func() { + int a; + cilk_spawn { + a = 1; + } + cilk_sync; +} diff --git a/llvm/test/Transforms/Tapir/SyncElimination/fail1.ll b/llvm/test/Transforms/Tapir/SyncElimination/fail1.ll new file mode 100644 index 00000000000000..0638fc2d81c5b9 --- /dev/null +++ b/llvm/test/Transforms/Tapir/SyncElimination/fail1.ll @@ -0,0 +1,37 @@ +; RUN: opt < %s -sync-elimination -S | FileCheck %s + +; ModuleID = 'fail1.cpp' +source_filename = "fail1.cpp" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: noinline nounwind uwtable +define void @_Z4funcv() #0 { +entry: + %a = alloca i32, align 4 + %syncreg = call token @llvm.syncregion.start() + detach within %syncreg, label %det.achd, label %det.cont + +det.achd: ; preds = %entry + store i32 1, i32* %a, align 4 + reattach within %syncreg, label %det.cont + +det.cont: ; preds = %det.achd, %entry + sync within %syncreg, label %sync.continue +; CHECK: sync within %syncreg, label %sync.continue + +sync.continue: ; preds = %det.cont + store i32 2, i32* %a, align 4 + sync within %syncreg, label %sync.continue1 +; CHECK-NOT: sync within %syncreg, label %sync.continue1 + +sync.continue1: ;
preds = %sync.continue + ret void +; CHECK: ret void +} + +; Function Attrs: argmemonly nounwind +declare token @llvm.syncregion.start() #1 + +attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } diff --git a/llvm/test/Transforms/Tapir/SyncElimination/fail2.cpp b/llvm/test/Transforms/Tapir/SyncElimination/fail2.cpp new file mode 100644 index 00000000000000..779d13b2483954 --- /dev/null +++ b/llvm/test/Transforms/Tapir/SyncElimination/fail2.cpp @@ -0,0 +1,10 @@ +#include <cilk/cilk.h> + +void func(int *a, int *b) { + cilk_spawn { + *a = 1; + } + cilk_sync; + *b = 2; + cilk_sync; +} diff --git a/llvm/test/Transforms/Tapir/SyncElimination/fail2.ll b/llvm/test/Transforms/Tapir/SyncElimination/fail2.ll new file mode 100644 index 00000000000000..c4d2d395658f34 --- /dev/null +++ b/llvm/test/Transforms/Tapir/SyncElimination/fail2.ll @@ -0,0 +1,42 @@ +; RUN: opt < %s -sync-elimination -S | FileCheck %s + +; ModuleID = 'fail2.cpp' +source_filename = "fail2.cpp" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: noinline nounwind uwtable +define void @_Z4funcPiS_(i32* %a, i32* %b) #0 { +entry: + %a.addr = alloca i32*, align 8 + %b.addr = alloca i32*, align 8 + %syncreg = call token @llvm.syncregion.start() + store i32* %a, i32** %a.addr, align 8 + store i32* %b, i32** %b.addr, align 8 + detach within %syncreg, label %det.achd, label %det.cont + +det.achd: ; preds = %entry + %0 = load i32*, i32** %a.addr, align 8 + store i32 1,
i32* %0, align 4 + reattach within %syncreg, label %det.cont + +det.cont: ; preds = %det.achd, %entry + sync within %syncreg, label %sync.continue +; CHECK: sync within %syncreg, label %sync.continue + +sync.continue: ; preds = %det.cont + %1 = load i32*, i32** %b.addr, align 8 + store i32 2, i32* %1, align 4 + sync within %syncreg, label %sync.continue1 +; CHECK-NOT: sync within %syncreg, label %sync.continue1 + +sync.continue1: ; preds = %sync.continue + ret void +; CHECK: ret void +} + +; Function Attrs: argmemonly nounwind +declare token @llvm.syncregion.start() #1 + +attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } diff --git a/llvm/test/Transforms/Tapir/SyncElimination/for1.cpp b/llvm/test/Transforms/Tapir/SyncElimination/for1.cpp new file mode 100644 index 00000000000000..bcf9db1d5e83f3 --- /dev/null +++ b/llvm/test/Transforms/Tapir/SyncElimination/for1.cpp @@ -0,0 +1,8 @@ +#include <cilk/cilk.h> + +void func() { + cilk_for (int i = 0; i < 10; i++) { + } + cilk_for (int i = 0; i < 10; i++) { + } +} diff --git a/llvm/test/Transforms/Tapir/SyncElimination/for1.ll b/llvm/test/Transforms/Tapir/SyncElimination/for1.ll new file mode 100644 index 00000000000000..394e04b2bc0731 --- /dev/null +++ b/llvm/test/Transforms/Tapir/SyncElimination/for1.ll @@ -0,0 +1,112 @@ +; RUN: opt < %s -sync-elimination -S | FileCheck %s + +; ModuleID = 'for1.cpp' +source_filename = "for1.cpp" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function
Attrs: noinline nounwind uwtable +define void @_Z4funcv() #0 { +entry: + %syncreg = call token @llvm.syncregion.start() + %__init = alloca i32, align 4 + %__begin = alloca i32, align 4 + %__end = alloca i32, align 4 + %syncreg1 = call token @llvm.syncregion.start() + %__init2 = alloca i32, align 4 + %__begin3 = alloca i32, align 4 + %__end4 = alloca i32, align 4 + store i32 0, i32* %__init, align 4 + store i32 0, i32* %__begin, align 4 + store i32 10, i32* %__end, align 4 + br label %pfor.cond + +pfor.cond: ; preds = %pfor.inc, %entry + %0 = load i32, i32* %__begin, align 4 + %1 = load i32, i32* %__end, align 4 + %cmp = icmp slt i32 %0, %1 + br i1 %cmp, label %pfor.detach, label %pfor.end + +pfor.detach: ; preds = %pfor.cond + %2 = load i32, i32* %__init, align 4 + %3 = load i32, i32* %__begin, align 4 + %mul = mul nsw i32 %3, 1 + %add = add nsw i32 %2, %mul + detach within %syncreg, label %pfor.body.entry, label %pfor.inc + +pfor.body.entry: ; preds = %pfor.detach + %i = alloca i32, align 4 + store i32 %add, i32* %i, align 4 + br label %pfor.body + +pfor.body: ; preds = %pfor.body.entry + br label %pfor.preattach + +pfor.preattach: ; preds = %pfor.body + reattach within %syncreg, label %pfor.inc + +pfor.inc: ; preds = %pfor.preattach, %pfor.detach + %4 = load i32, i32* %__begin, align 4 + %inc = add nsw i32 %4, 1 + store i32 %inc, i32* %__begin, align 4 + br label %pfor.cond, !llvm.loop !1 + +pfor.end: ; preds = %pfor.cond + sync within %syncreg, label %pfor.end.continue + +pfor.end.continue: ; preds = %pfor.end + store i32 0, i32* %__init2, align 4 + store i32 0, i32* %__begin3, align 4 + store i32 10, i32* %__end4, align 4 + br label %pfor.cond3 + +; CHECK: pfor.end +; CHECK-NOT: sync +; CHECK: pfor.cond + +pfor.cond3: ; preds = %pfor.inc8, %pfor.end.continue + %5 = load i32, i32* %__begin3, align 4 + %6 = load i32, i32* %__end4, align 4 + %cmp6 = icmp slt i32 %5, %6 + br i1 %cmp6, label %pfor.detach5, label %pfor.end10 + +pfor.detach5: ; preds = %pfor.cond3 + 
%7 = load i32, i32* %__init2, align 4 + %8 = load i32, i32* %__begin3, align 4 + %mul8 = mul nsw i32 %8, 1 + %add9 = add nsw i32 %7, %mul8 + detach within %syncreg1, label %pfor.body.entry6, label %pfor.inc8 + +pfor.body.entry6: ; preds = %pfor.detach5 + %i11 = alloca i32, align 4 + store i32 %add9, i32* %i11, align 4 + br label %pfor.body6 + +pfor.body6: ; preds = %pfor.body.entry6 + br label %pfor.preattach7 + +pfor.preattach7: ; preds = %pfor.body6 + reattach within %syncreg1, label %pfor.inc8 + +pfor.inc8: ; preds = %pfor.preattach7, %pfor.detach5 + %9 = load i32, i32* %__begin3, align 4 + %inc15 = add nsw i32 %9, 1 + store i32 %inc15, i32* %__begin3, align 4 + br label %pfor.cond3, !llvm.loop !3 + +pfor.end10: ; preds = %pfor.cond3 + sync within %syncreg1, label %pfor.end.continue11 + +pfor.end.continue11: ; preds = %pfor.end10 + ret void +} + +; Function Attrs: argmemonly nounwind +declare token @llvm.syncregion.start() #1 + +attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } + +!1 = distinct !{!1, !2} +!2 = !{!"tapir.loop.spawn.strategy", i32 1} +!3 = distinct !{!3, !2} diff --git a/llvm/test/Transforms/Tapir/SyncElimination/for2.cpp b/llvm/test/Transforms/Tapir/SyncElimination/for2.cpp new file mode 100644 index 00000000000000..5627249702cef6 --- /dev/null +++ b/llvm/test/Transforms/Tapir/SyncElimination/for2.cpp @@ -0,0 +1,8 @@ +#include <cilk/cilk.h> + +void func() { + cilk_for (int i = 0; i < 100; i++) { + cilk_for (int j = 0; j < 3; j++) { + } + } +} diff --git
a/llvm/test/Transforms/Tapir/SyncElimination/for2.ll b/llvm/test/Transforms/Tapir/SyncElimination/for2.ll new file mode 100644 index 00000000000000..91b70b4db95e94 --- /dev/null +++ b/llvm/test/Transforms/Tapir/SyncElimination/for2.ll @@ -0,0 +1,78 @@ +; RUN: opt < %s -sync-elimination -S | FileCheck %s +; XFAIL: * + +; ModuleID = 'for2.cpp' +source_filename = "for2.cpp" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define void @_Z4funcv() #0 { +entry: + %syncreg = call token @llvm.syncregion.start() + br label %pfor.cond + +pfor.cond: ; preds = %pfor.inc15, %entry + %__begin.0 = phi i32 [ 0, %entry ], [ %inc16, %pfor.inc15 ] + %cmp = icmp slt i32 %__begin.0, 100 + br i1 %cmp, label %pfor.detach, label %pfor.cond.cleanup + +pfor.cond.cleanup: ; preds = %pfor.cond +;; The sync before a return is not safe to remove. +; CHECK: sync within %syncreg, label %pfor.end.continue + sync within %syncreg, label %pfor.end.continue + +pfor.end.continue: ; preds = %pfor.cond.cleanup + ret void + +pfor.detach: ; preds = %pfor.cond + detach within %syncreg, label %pfor.body.entry, label %pfor.inc15 + +pfor.body.entry: ; preds = %pfor.detach + %syncreg1 = call token @llvm.syncregion.start() + br label %pfor.body + +pfor.body: ; preds = %pfor.body.entry + br label %pfor.cond5 + +pfor.cond5: ; preds = %pfor.inc, %pfor.body + %__begin3.0 = phi i32 [ 0, %pfor.body ], [ %inc, %pfor.inc ] + %cmp6 = icmp slt i32 %__begin3.0, 3 + br i1 %cmp6, label %pfor.detach9, label %pfor.cond.cleanup7 + +; CHECK: pfor.cond5 +pfor.cond.cleanup7: ; preds = %pfor.cond5 +; CHECK-NOT: sync within %syncreg1, label %pfor.end.continue + sync within %syncreg1, label %pfor.end.continue8 +; CHECK: pfor.inc15 + +pfor.end.continue8: ; preds = %pfor.cond.cleanup7 + reattach within %syncreg, label %pfor.inc15 + +pfor.detach9: ; preds = %pfor.cond5 + detach within %syncreg1, label %pfor.body.entry12, label %pfor.inc + 
+pfor.body.entry12: ; preds = %pfor.detach9 + br label %pfor.preattach + +pfor.preattach: ; preds = %pfor.body.entry12 + reattach within %syncreg1, label %pfor.inc + +pfor.inc: ; preds = %pfor.preattach, %pfor.detach9 + %inc = add nsw i32 %__begin3.0, 1 + br label %pfor.cond5, !llvm.loop !2 + +pfor.inc15: ; preds = %pfor.end.continue8, %pfor.detach + %inc16 = add nsw i32 %__begin.0, 1 + br label %pfor.cond, !llvm.loop !4 +} + +; Function Attrs: argmemonly nounwind +declare token @llvm.syncregion.start() #1 + +attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } + +!2 = distinct !{!2, !3} +!3 = !{!"tapir.loop.spawn.strategy", i32 1} +!4 = distinct !{!4, !3} diff --git a/llvm/test/Transforms/Tapir/dac-loopspawning-simple.ll b/llvm/test/Transforms/Tapir/dac-loopspawning-simple.ll new file mode 100644 index 00000000000000..a31d07f206846d --- /dev/null +++ b/llvm/test/Transforms/Tapir/dac-loopspawning-simple.ll @@ -0,0 +1,98 @@ +; Test that Tapir's loop spawning pass transforms this simple loop +; into recursive divide-and-conquer. 
+ +; RUN: opt < %s -loop-spawning -S | FileCheck %s + +; Function Attrs: nounwind uwtable +define void @foo(i32 %n) local_unnamed_addr #0 { +; CHECK-LABEL: @foo( +entry: + %syncreg = call token @llvm.syncregion.start() + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %pfor.detach.preheader, label %pfor.cond.cleanup + +pfor.detach.preheader: ; preds = %entry +; CHECK: pfor.detach.preheader: +; CHECK: [[LIMIT:%[0-9]+]] = add [[TYPE:i[0-9]+]] %n, -1 +; CHECK: call fastcc void @[[OUTLINED:[a-zA-Z0-9._]+]]( +; CHECK: [[TYPE]] 0 +; CHECK: [[TYPE]] [[LIMIT]] +; CHECK: [[TYPE]] {{[%]?[a-zA-Z0-9._]+}} +; CHECK-NEXT: br label %pfor.cond.cleanup.loopexit + br label %pfor.detach + +pfor.cond.cleanup.loopexit: ; preds = %pfor.inc + br label %pfor.cond.cleanup + +pfor.cond.cleanup: ; preds = %pfor.cond.cleanup.loopexit, %entry +; CHECK: pfor.cond.cleanup +; CHECK-NOT: sync within %syncreg, label %0 + sync within %syncreg, label %0 + +;