From: Mike Hommey <mh@glandium.org>
Date: Wed, 1 Sep 2010 21:07:45 +0200
Subject: Add nanojit support for ARMv4T

Thanks Albin Tonnerre for the initial patch.
https://bugzilla.mozilla.org/show_bug.cgi?id=586224
https://bugzilla.mozilla.org/show_bug.cgi?id=586625
https://bugzilla.mozilla.org/show_bug.cgi?id=586262
https://bugzilla.mozilla.org/show_bug.cgi?id=585604
https://bugzilla.mozilla.org/show_bug.cgi?id=552624
---
 js/src/nanojit/NativeARM.cpp |  156 ++++++++++++++++++++++--------------------
 js/src/nanojit/NativeARM.h   |    3 +-
 js/src/nanojit/avmplus.cpp   |    7 --
 js/src/nanojit/avmplus.h     |   11 +++-
 js/src/nanojit/njcpudetect.h |  110 +++++++++++++++++++++++++++++
 5 files changed, 202 insertions(+), 85 deletions(-)
 create mode 100644 js/src/nanojit/njcpudetect.h

diff --git a/js/src/nanojit/NativeARM.cpp b/js/src/nanojit/NativeARM.cpp
index 9387191..2333960 100644
--- a/js/src/nanojit/NativeARM.cpp
+++ b/js/src/nanojit/NativeARM.cpp
@@ -42,7 +42,6 @@
 
 #ifdef UNDER_CE
 #include <cmnintrin.h>
-extern "C" bool blx_lr_broken();
 #endif
 
 #if defined(AVMPLUS_LINUX)
@@ -109,44 +108,14 @@ Assembler::decOp2Imm(uint32_t enc)
 #endif
 
 // Calculate the number of leading zeroes in data.
-inline uint32_t
-Assembler::CountLeadingZeroes(uint32_t data)
+static inline uint32_t
+CountLeadingZeroesSlow(uint32_t data)
 {
-    uint32_t    leading_zeroes;
-
-    // We can't do CLZ on anything earlier than ARMv5. Architectures as early
-    // as that aren't supported, but assert that we aren't running on one
-    // anyway.
-    // If ARMv4 support is required in the future for some reason, we can do a
-    // run-time check on config.arch and fall back to the C routine, but for
-    // now we can avoid the cost of the check as we don't intend to support
-    // ARMv4 anyway.
-    NanoAssert(ARM_ARCH >= 5);
-
-#if defined(__ARMCC__)
-    // ARMCC can do this with an intrinsic.
-    leading_zeroes = __clz(data);
-
-// current Android GCC compiler incorrectly refuses to compile 'clz' for armv5
-// (even though this is a legal instruction there). Since we currently only compile for ARMv5
-// for emulation, we don't care too much (but we DO care for ARMv6+ since those are "real"
-// devices).
-#elif defined(__GNUC__) && !(defined(ANDROID) && __ARM_ARCH__ <= 5) 
-    // GCC can use inline assembler to insert a CLZ instruction.
-    __asm (
-        "   clz     %0, %1  \n"
-        :   "=r"    (leading_zeroes)
-        :   "r"     (data)
-    );
-#elif defined(WINCE)
-    // WinCE can do this with an intrinsic.
-    leading_zeroes = _CountLeadingZeros(data);
-#else
     // Other platforms must fall back to a C routine. This won't be as
     // efficient as the CLZ instruction, but it is functional.
     uint32_t    try_shift;
 
-    leading_zeroes = 0;
+    uint32_t    leading_zeroes = 0;
 
     // This loop does a bisection search rather than the obvious rotation loop.
     // This should be faster, though it will still be no match for CLZ.
@@ -156,6 +125,43 @@ Assembler::CountLeadingZeroes(uint32_t data)
             leading_zeroes = shift;
         }
     }
+
+    return leading_zeroes;
+}
+
+inline uint32_t
+Assembler::CountLeadingZeroes(uint32_t data)
+{
+    uint32_t    leading_zeroes;
+
+#if defined(__ARMCC__)
+    // ARMCC can do this with an intrinsic.
+    leading_zeroes = __clz(data);
+#elif defined(__GNUC__)
+    // GCC can use inline assembler to insert a CLZ instruction.
+    if (ARM_ARCH_AT_LEAST(5)) {
+        __asm (
+#if defined(ANDROID) && (NJ_COMPILER_ARM_ARCH < 7)
+        // On Android gcc compiler, the clz instruction is not supported with a
+        // target smaller than armv7, despite it being legal for armv5+.
+            "   .arch armv7-a\n"
+#elif (NJ_COMPILER_ARM_ARCH < 5)
+        // Targetting armv5t allows a toolchain with armv4t target to still build
+        // with clz, and clz to be used when appropriate at runtime.
+            "   .arch armv5t\n"
+#endif
+            "   clz     %0, %1  \n"
+            :   "=r"    (leading_zeroes)
+            :   "r"     (data)
+        );
+    } else {
+        leading_zeroes = CountLeadingZeroesSlow(data);
+    }
+#elif defined(UNDER_CE)
+    // WinCE can do this with an intrinsic.
+    leading_zeroes = _CountLeadingZeros(data);
+#else
+    leading_zeroes = CountLeadingZeroesSlow(data);
 #endif
 
     // Assert that the operation worked!
@@ -462,11 +468,6 @@ Assembler::asm_eor_imm(Register rd, Register rn, int32_t imm, int stat /* =0 */)
 void
 Assembler::nInit(AvmCore*)
 {
-#ifdef UNDER_CE
-    blx_lr_bug = blx_lr_broken();
-#else
-    blx_lr_bug = 0;
-#endif
 }
 
 void Assembler::nBeginAssembly()
@@ -554,12 +555,18 @@ Assembler::nFragExit(LInsp guard)
 NIns*
 Assembler::genEpilogue()
 {
-    // On ARMv5+, loading directly to PC correctly handles interworking.
-    // Note that we don't support anything older than ARMv5.
-    NanoAssert(ARM_ARCH >= 5);
+    RegisterMask savingMask;
 
-    RegisterMask savingMask = rmask(FP) | rmask(PC);
+    if (ARM_ARCH_AT_LEAST(5)) {
+        // On ARMv5+, loading directly to PC correctly handles interworking.
+        savingMask = rmask(FP) | rmask(PC);
 
+    } else {
+        // On ARMv4T, interworking is not handled properly, therefore, we pop
+        // lr and use bx lr to avoid that.
+        savingMask = rmask(FP) | rmask(LR);
+        BX(LR);
+    }
     POP_mask(savingMask); // regs
 
     return _nIns;
@@ -867,25 +874,23 @@ Assembler::asm_call(LInsp ins)
             outputf("        %p:", _nIns);
         )
 
-        // Direct call: on v5 and above (where the calling sequence doesn't
-        // corrupt LR until the actual branch instruction), we can avoid an
-        // interlock in the "long" branch sequence by manually loading the
-        // target address into LR ourselves before setting up the parameters
-        // in other registers.
         BranchWithLink((NIns*)call->_address);
     } else {
-        // Indirect call: we assign the address arg to LR since it's not
-        // used for regular arguments, and is otherwise scratch since it's
-        // clobberred by the call. On v4/v4T, where we have to manually do
-        // the equivalent of a BLX, move LR into IP before corrupting LR
-        // with the return address.
-        if (blx_lr_bug) {
+        // Indirect call: we assign the address arg to LR
+        if (ARM_ARCH_AT_LEAST(5)) {
+#ifndef UNDER_CE
             // workaround for msft device emulator bug (blx lr emulated as no-op)
             underrunProtect(8);
             BLX(IP);
-            MOV(IP,LR);
-        } else {
+            MOV(IP, LR);
+#else
             BLX(LR);
+#endif
+        } else {
+            underrunProtect(12);
+            BX(IP);
+            MOV(LR, PC);
+            MOV(IP, LR);
         }
         asm_regarg(ARGSIZE_LO, ins->arg(--argc), LR);
     }
@@ -1494,7 +1499,7 @@ Assembler::BranchWithLink(NIns* addr)
     // reserve enough space for the LDR sequence. This should give us a slight
     // net gain over reserving the exact amount required for shorter branches.
     // This _must_ be called before PC_OFFSET_FROM as it can move _nIns!
-    underrunProtect(4+LD32_size);
+    underrunProtect(8+LD32_size);
 
     // Calculate the offset from the instruction that is about to be
     // written (at _nIns-1) to the target.
@@ -1513,29 +1518,30 @@ Assembler::BranchWithLink(NIns* addr)
             // BL target
             *(--_nIns) = (NIns)( (COND_AL) | (0xB<<24) | (offs2) );
             asm_output("bl %p", (void*)addr);
-        } else {
-            // The target is Thumb, so emit a BLX.
-
-            // We need to emit an ARMv5+ instruction, so assert that we have a
-            // suitable processor. Note that we don't support ARMv4(T), but
-            // this serves as a useful sanity check.
-            NanoAssert(ARM_ARCH >= 5);
-
+            return;
+        } else if (ARM_ARCH_AT_LEAST(5)) {
+            // The target is Thumb, so emit a BLX (ARMv5+)
             // The (pre-shifted) value of the "H" bit in the BLX encoding.
             uint32_t    H = (offs & 0x2) << 23;
 
             // BLX addr
             *(--_nIns) = (NIns)( (0xF << 28) | (0x5<<25) | (H) | (offs2) );
             asm_output("blx %p", (void*)addr);
+            return;
         }
-    } else {
+        /* If we get here, it means we are on ARMv4T, and the target is Thumb,
+           in which case we want to emit a branch with a register */
+    }
+    if (ARM_ARCH_AT_LEAST(5)) {
         // Load the target address into IP and branch to that. We've already
         // done underrunProtect, so we can skip that here.
         BLX(IP, false);
-
-        // LDR IP, =addr
-        asm_ld_imm(IP, (int32_t)addr, false);
+    } else {
+        BX(IP);
+        MOV(LR, PC);
     }
+    // LDR IP, =addr
+    asm_ld_imm(IP, (int32_t)addr, false);
 }
 
 // This is identical to BranchWithLink(NIns*) but emits a branch to an address
@@ -1546,20 +1552,22 @@ Assembler::BLX(Register addr, bool chk /* = true */)
     // We need to emit an ARMv5+ instruction, so assert that we have a suitable
     // processor. Note that we don't support ARMv4(T), but this serves as a
     // useful sanity check.
-    NanoAssert(ARM_ARCH >= 5);
+    NanoAssert(ARM_ARCH_AT_LEAST(5));
 
     NanoAssert(IsGpReg(addr));
+#ifdef UNDER_CE
     // There is a bug in the WinCE device emulator which stops "BLX LR" from
     // working as expected. Assert that we never do that!
-    if (blx_lr_bug) { NanoAssert(addr != LR); }
+    NanoAssert(addr != LR);
+#endif
 
     if (chk) {
         underrunProtect(4);
     }
 
-    // BLX IP
+    // BLX reg
     *(--_nIns) = (NIns)( (COND_AL) | (0x12<<20) | (0xFFF<<8) | (0x3<<4) | (addr) );
-    asm_output("blx ip");
+    asm_output("blx %s", gpn(addr));
 }
 
 // Emit the code required to load a memory address into a register as follows:
@@ -2177,7 +2185,7 @@ Assembler::asm_arith(LInsp ins)
             // common for (rr == ra) and is thus likely to be the most
             // efficient case; if ra is no longer used after this LIR
             // instruction, it is re-used for the result register (rr).
-            if ((ARM_ARCH > 5) || (rr != rb)) {
+            if ((ARM_ARCH_AT_LEAST(6)) || (rr != rb)) {
                 // Newer cores place no restrictions on the registers used in a
                 // MUL instruction (compared to other arithmetic instructions).
                 MUL(rr, rb, ra);
diff --git a/js/src/nanojit/NativeARM.h b/js/src/nanojit/NativeARM.h
index 55b2e8e..a0b0b87 100644
--- a/js/src/nanojit/NativeARM.h
+++ b/js/src/nanojit/NativeARM.h
@@ -229,7 +229,6 @@ verbose_only( extern const char* shiftNames[]; )
     int *       _nSlot;                                                         \
     int *       _startingSlot;                                                  \
     int *       _nExitSlot;                                                     \
-    bool        blx_lr_bug;                                                     \
     int         max_out_args; /* bytes */                                      
 
 //nj_dprintf("jmp_l_n count=%d, nins=%X, %X = %X\n", (_c), nins, _nIns, ((intptr_t)(nins+(_c))-(intptr_t)_nIns - 4) );
@@ -461,7 +460,7 @@ enum {
 // _d = _l * _r
 #define MUL(_d,_l,_r)  do {                                  \
         underrunProtect(4);                                                 \
-        NanoAssert((ARM_ARCH >= 6) || ((_d) != (_l)));                      \
+        NanoAssert((ARM_ARCH_AT_LEAST(6)) || ((_d) != (_l)));                      \
         NanoAssert(IsGpReg(_d) && IsGpReg(_l) && IsGpReg(_r));              \
         NanoAssert(((_d) != PC) && ((_l) != PC) && ((_r) != PC));           \
         *(--_nIns) = (NIns)( COND_AL | (_d)<<16 | (_r)<<8 | 0x90 | (_l) );  \
diff --git a/js/src/nanojit/avmplus.cpp b/js/src/nanojit/avmplus.cpp
index ab84abd..f436c1e 100644
--- a/js/src/nanojit/avmplus.cpp
+++ b/js/src/nanojit/avmplus.cpp
@@ -45,13 +45,6 @@
     typedef void *maddr_ptr;
 #endif
 
-#if defined(AVMPLUS_ARM) && defined(UNDER_CE)
-extern "C" bool
-blx_lr_broken() {
-    return false;
-}
-#endif
-
 using namespace avmplus;
 
 Config AvmCore::config;
diff --git a/js/src/nanojit/avmplus.h b/js/src/nanojit/avmplus.h
index ffc0873..76370f5 100644
--- a/js/src/nanojit/avmplus.h
+++ b/js/src/nanojit/avmplus.h
@@ -50,11 +50,18 @@
 #include "jstypes.h"
 #include "jsstdint.h"
 
+#include "njcpudetect.h"
+
 #ifdef AVMPLUS_ARM
-#define ARM_ARCH   config.arch
+#ifdef DEBUG
+#define ARM_ARCH_AT_LEAST(wanted) (config.arch >= (wanted))
 #define ARM_VFP    config.vfp
+#else
+#define ARM_ARCH_AT_LEAST(wanted) \
+    ((NJ_COMPILER_ARM_ARCH >= (wanted)) || (config.arch >= (wanted)))
+#define ARM_VFP ((NJ_COMPILER_ARM_ARCH >= 7) || (config.vfp))
+#endif
 #define ARM_THUMB2 config.thumb2
-
 #endif
 
 #if !defined(AVMPLUS_LITTLE_ENDIAN) && !defined(AVMPLUS_BIG_ENDIAN)
diff --git a/js/src/nanojit/njcpudetect.h b/js/src/nanojit/njcpudetect.h
new file mode 100644
index 0000000..1143005
--- /dev/null
+++ b/js/src/nanojit/njcpudetect.h
@@ -0,0 +1,110 @@
+/* -*- Mode: C++; c-basic-offset: 4; indent-tabs-mode: nil; tab-width: 4 -*- */
+/* vi: set ts=4 sw=4 expandtab: (add to ~/.vimrc: set modeline modelines=5) */
+/* ***** BEGIN LICENSE BLOCK *****
+ * Version: MPL 1.1/GPL 2.0/LGPL 2.1
+ *
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * The Original Code is [Open Source Virtual Machine].
+ *
+ * The Initial Developer of the Original Code is
+ * Adobe System Incorporated.
+ * Portions created by the Initial Developer are Copyright (C) 2004-2007
+ * the Initial Developer. All Rights Reserved.
+ *
+ * Contributor(s):
+ *   Adobe AS3 Team
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * either the GNU General Public License Version 2 or later (the "GPL"), or
+ * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+ * in which case the provisions of the GPL or the LGPL are applicable instead
+ * of those above. If you wish to allow use of your version of this file only
+ * under the terms of either the GPL or the LGPL, and not to allow others to
+ * use your version of this file under the terms of the MPL, indicate your
+ * decision by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL or the LGPL. If you do not delete
+ * the provisions above, a recipient may use your version of this file under
+ * the terms of any one of the MPL, the GPL or the LGPL.
+ *
+ * ***** END LICENSE BLOCK ***** */
+
+#ifndef __njcpudetect__
+#define __njcpudetect__
+
+/***
+ * Note: this file should not include *any* other files, nor should it wrap
+ * itself in ifdef FEATURE_NANOJIT, nor should it do anything other than
+ * define preprocessor symbols.
+ */
+
+/***
+ * NJ_COMPILER_ARM_ARCH attempts to specify the minimum ARM architecture
+ * that the C++ compiler has specified. Note that although Config::arm_arch
+ * is initialized to this value by default, there is no requirement that they
+ * be in sync.
+ *
+ * Note, this is done via #define so that downstream preprocessor usage can
+ * examine it, but please don't attempt to redefine it.
+ *
+ * Note, this is deliberately not encased in "ifdef NANOJIT_ARM", as this file
+ * may be included before that is defined. On non-ARM platforms we will hit the
+ * "Unable to determine" case.
+ */
+
+// GCC and RealView usually define __ARM_ARCH__
+#if defined(__ARM_ARCH__)
+
+    #define NJ_COMPILER_ARM_ARCH __ARM_ARCH__
+
+// ok, try well-known GCC flags ( see http://gcc.gnu.org/onlinedocs/gcc/ARM-Options.html )
+#elif     defined(__ARM_ARCH_7__) || \
+        defined(__ARM_ARCH_7A__) || \
+        defined(__ARM_ARCH_7M__) || \
+        defined(__ARM_ARCH_7R__) || \
+        defined(_ARM_ARCH_7)
+
+    #define NJ_COMPILER_ARM_ARCH 7
+
+#elif   defined(__ARM_ARCH_6__) || \
+        defined(__ARM_ARCH_6J__) || \
+        defined(__ARM_ARCH_6T2__) || \
+        defined(__ARM_ARCH_6Z__) || \
+        defined(__ARM_ARCH_6ZK__) || \
+        defined(__ARM_ARCH_6M__) || \
+        defined(_ARM_ARCH_6)
+
+    #define NJ_COMPILER_ARM_ARCH 6
+
+#elif   defined(__ARM_ARCH_5__) || \
+        defined(__ARM_ARCH_5T__) || \
+        defined(__ARM_ARCH_5E__) || \
+        defined(__ARM_ARCH_5TE__)
+
+    #define NJ_COMPILER_ARM_ARCH 5
+
+#elif   defined(__ARM_ARCH_4T__)
+
+    #define NJ_COMPILER_ARM_ARCH 4
+
+// Visual C has its own mojo
+#elif defined(_MSC_VER) && defined(_M_ARM)
+
+    #define NJ_COMPILER_ARM_ARCH _M_ARM
+
+#else
+
+    // non-numeric value
+    #define NJ_COMPILER_ARM_ARCH "Unable to determine valid NJ_COMPILER_ARM_ARCH (nanojit only supports ARMv4T or later)"
+
+#endif
+
+#endif // __njcpudetect__