guest_arm64_toIR.c - Android社区 - https://www.androidos.net.cn/

/* -*- mode: C; c-basic-offset: 3; -*- */

/*--------------------------------------------------------------------*/
/*--- begin                                     guest_arm64_toIR.c ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

The GNU General Public License is contained in the file COPYING.
*/

/* KNOWN LIMITATIONS 2014-Nov-16

* Correctness: FMAXNM, FMINNM are implemented the same as FMAX/FMIN.

Also FP comparison "unordered" .. is implemented as normal FP
     comparison.

Both should be fixed.  They behave incorrectly in the presence of
     NaNs.

FMULX is treated the same as FMUL.  That's also not correct.

* Floating multiply-add (etc) insns.  Are split into a multiply and 
     an add, and so suffer double rounding and hence sometimes the
     least significant mantissa bit is incorrect.  Fix: use the IR
     multiply-add IROps instead.

* FRINTA, FRINTN are kludged .. they just round to nearest.  No special
     handling for the "ties" case.  FRINTX might be dubious too.

* Ditto FCVTXN.  No idea what "round to odd" means.  This implementation
     just rounds to nearest.
*/

/* "Special" instructions.

This instruction decoder can decode four special instructions
   which mean nothing natively (are no-ops as far as regs/mem are
   concerned) but have meaning for supporting Valgrind.  A special
   instruction is flagged by a 16-byte preamble:

93CC0D8C 93CC358C 93CCCD8C 93CCF58C
      (ror x12, x12, #3;   ror x12, x12, #13
       ror x12, x12, #51;  ror x12, x12, #61)

Following that, one of the following 3 are allowed
   (standard interpretation in parentheses):

AA0A014A (orr x10,x10,x10)   X3 = client_request ( X4 )
      AA0B016B (orr x11,x11,x11)   X3 = guest_NRADDR
      AA0C018C (orr x12,x12,x12)   branch-and-link-to-noredir X8
      AA090129 (orr x9,x9,x9)      IR injection

Any other bytes following the 16-byte preamble are illegal and
   constitute a failure in instruction decoding.  This all assumes
   that the preamble will never occur except in specific code
   fragments designed for Valgrind to catch.
*/

/* Translates ARM64 code to IR. */

#include "libvex_basictypes.h"
#include "libvex_ir.h"
#include "libvex.h"
#include "libvex_guest_arm64.h"

#include "main_util.h"
#include "main_globals.h"
#include "guest_generic_bb_to_IR.h"
#include "guest_arm64_defs.h"

/*------------------------------------------------------------*/
/*--- Globals                                              ---*/
/*------------------------------------------------------------*/

/* These are set at the start of the translation of a instruction, so
   that we don't have to pass them around endlessly.  CONST means does
   not change during translation of the instruction.
*/

/* CONST: what is the host's endianness?  We need to know this in
   order to do sub-register accesses to the SIMD/FP registers
   correctly. */
static VexEndness host_endness;

/* CONST: The guest address for the instruction currently being
   translated.  */
static Addr64 guest_PC_curr_instr;

/* MOD: The IRSB* into which we're generating code. */
static IRSB* irsb;

/*------------------------------------------------------------*/
/*--- Debugging output                                     ---*/
/*------------------------------------------------------------*/

#define DIP(format, args...)           \
   if (vex_traceflags & VEX_TRACE_FE)  \
      vex_printf(format, ## args)

#define DIS(buf, format, args...)      \
   if (vex_traceflags & VEX_TRACE_FE)  \
      vex_sprintf(buf, format, ## args)

/*------------------------------------------------------------*/
/*--- Helper bits and pieces for deconstructing the        ---*/
/*--- arm insn stream.                                     ---*/
/*------------------------------------------------------------*/

/* Do a little-endian load of a 32-bit word, regardless of the
   endianness of the underlying host. */
static inline UInt getUIntLittleEndianly ( const UChar* p )
{
   UInt w = 0;
   w = (w << 8) | p[3];
   w = (w << 8) | p[2];
   w = (w << 8) | p[1];
   w = (w << 8) | p[0];
   return w;
}

/* Sign extend a N-bit value up to 64 bits, by copying
   bit N-1 into all higher positions. */
static ULong sx_to_64 ( ULong x, UInt n )
{
   vassert(n > 1 && n < 64);
   Long r = (Long)x;
   r = (r << (64-n)) >> (64-n);
   return (ULong)r;
}

//ZZ /* Do a little-endian load of a 16-bit word, regardless of the
//ZZ    endianness of the underlying host. */
//ZZ static inline UShort getUShortLittleEndianly ( UChar* p )
//ZZ {
//ZZ    UShort w = 0;
//ZZ    w = (w << 8) | p[1];
//ZZ    w = (w << 8) | p[0];
//ZZ    return w;
//ZZ }
//ZZ 
//ZZ static UInt ROR32 ( UInt x, UInt sh ) {
//ZZ    vassert(sh >= 0 && sh < 32);
//ZZ    if (sh == 0)
//ZZ       return x;
//ZZ    else
//ZZ       return (x << (32-sh)) | (x >> sh);
//ZZ }
//ZZ 
//ZZ static Int popcount32 ( UInt x )
//ZZ {
//ZZ    Int res = 0, i;
//ZZ    for (i = 0; i < 32; i++) {
//ZZ       res += (x & 1);
//ZZ       x >>= 1;
//ZZ    }
//ZZ    return res;
//ZZ }
//ZZ 
//ZZ static UInt setbit32 ( UInt x, Int ix, UInt b )
//ZZ {
//ZZ    UInt mask = 1 << ix;
//ZZ    x &= ~mask;
//ZZ    x |= ((b << ix) & mask);
//ZZ    return x;
//ZZ }

#define BITS2(_b1,_b0)  \
   (((_b1) << 1) | (_b0))

#define BITS3(_b2,_b1,_b0)  \
  (((_b2) << 2) | ((_b1) << 1) | (_b0))

#define BITS4(_b3,_b2,_b1,_b0)  \
   (((_b3) << 3) | ((_b2) << 2) | ((_b1) << 1) | (_b0))

#define BITS8(_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0)  \
   ((BITS4((_b7),(_b6),(_b5),(_b4)) << 4)  \
    | BITS4((_b3),(_b2),(_b1),(_b0)))

#define BITS5(_b4,_b3,_b2,_b1,_b0)  \
   (BITS8(0,0,0,(_b4),(_b3),(_b2),(_b1),(_b0)))
#define BITS6(_b5,_b4,_b3,_b2,_b1,_b0)  \
   (BITS8(0,0,(_b5),(_b4),(_b3),(_b2),(_b1),(_b0)))
#define BITS7(_b6,_b5,_b4,_b3,_b2,_b1,_b0)  \
   (BITS8(0,(_b6),(_b5),(_b4),(_b3),(_b2),(_b1),(_b0)))

#define BITS9(_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0)  \
   (((_b8) << 8)  \
    | BITS8((_b7),(_b6),(_b5),(_b4),(_b3),(_b2),(_b1),(_b0)))

#define BITS10(_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0)  \
   (((_b9) << 9) | ((_b8) << 8)  \
    | BITS8((_b7),(_b6),(_b5),(_b4),(_b3),(_b2),(_b1),(_b0)))

#define BITS11(_b10,_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0)  \
   (((_b10) << 10)  \
    | BITS10(_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0))

#define BITS12(_b11, _b10,_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0) \
   (((_b11) << 11)  \
    | BITS11(_b10,_b9,_b8,_b7,_b6,_b5,_b4,_b3,_b2,_b1,_b0))

#define X00 BITS2(0,0)
#define X01 BITS2(0,1)
#define X10 BITS2(1,0)
#define X11 BITS2(1,1)

// produces _uint[_bMax:_bMin]
#define SLICE_UInt(_uint,_bMax,_bMin)  \
   (( ((UInt)(_uint)) >> (_bMin))  \
    & (UInt)((1ULL << ((_bMax) - (_bMin) + 1)) - 1ULL))

/*------------------------------------------------------------*/
/*--- Helper bits and pieces for creating IR fragments.    ---*/
/*------------------------------------------------------------*/

static IRExpr* mkV128 ( UShort w )
{
   return IRExpr_Const(IRConst_V128(w));
}

static IRExpr* mkU64 ( ULong i )
{
   return IRExpr_Const(IRConst_U64(i));
}

static IRExpr* mkU32 ( UInt i )
{
   return IRExpr_Const(IRConst_U32(i));
}

static IRExpr* mkU16 ( UInt i )
{
   vassert(i < 65536);
   return IRExpr_Const(IRConst_U16(i));
}

static IRExpr* mkU8 ( UInt i )
{
   vassert(i < 256);
   return IRExpr_Const(IRConst_U8( (UChar)i ));
}

static IRExpr* mkexpr ( IRTemp tmp )
{
   return IRExpr_RdTmp(tmp);
}

static IRExpr* unop ( IROp op, IRExpr* a )
{
   return IRExpr_Unop(op, a);
}

static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
{
   return IRExpr_Binop(op, a1, a2);
}

static IRExpr* triop ( IROp op, IRExpr* a1, IRExpr* a2, IRExpr* a3 )
{
   return IRExpr_Triop(op, a1, a2, a3);
}

static IRExpr* loadLE ( IRType ty, IRExpr* addr )
{
   return IRExpr_Load(Iend_LE, ty, addr);
}

/* Add a statement to the list held by "irbb". */
static void stmt ( IRStmt* st )
{
   addStmtToIRSB( irsb, st );
}

static void assign ( IRTemp dst, IRExpr* e )
{
   stmt( IRStmt_WrTmp(dst, e) );
}

static void storeLE ( IRExpr* addr, IRExpr* data )
{
   stmt( IRStmt_Store(Iend_LE, addr, data) );
}

//ZZ static void storeGuardedLE ( IRExpr* addr, IRExpr* data, IRTemp guardT )
//ZZ {
//ZZ    if (guardT == IRTemp_INVALID) {
//ZZ       /* unconditional */
//ZZ       storeLE(addr, data);
//ZZ    } else {
//ZZ       stmt( IRStmt_StoreG(Iend_LE, addr, data,
//ZZ                           binop(Iop_CmpNE32, mkexpr(guardT), mkU32(0))) );
//ZZ    }
//ZZ }
//ZZ 
//ZZ static void loadGuardedLE ( IRTemp dst, IRLoadGOp cvt,
//ZZ                             IRExpr* addr, IRExpr* alt, 
//ZZ                             IRTemp guardT /* :: Ity_I32, 0 or 1 */ )
//ZZ {
//ZZ    if (guardT == IRTemp_INVALID) {
//ZZ       /* unconditional */
//ZZ       IRExpr* loaded = NULL;
//ZZ       switch (cvt) {
//ZZ          case ILGop_Ident32:
//ZZ             loaded = loadLE(Ity_I32, addr); break;
//ZZ          case ILGop_8Uto32:
//ZZ             loaded = unop(Iop_8Uto32, loadLE(Ity_I8, addr)); break;
//ZZ          case ILGop_8Sto32:
//ZZ             loaded = unop(Iop_8Sto32, loadLE(Ity_I8, addr)); break;
//ZZ          case ILGop_16Uto32:
//ZZ             loaded = unop(Iop_16Uto32, loadLE(Ity_I16, addr)); break;
//ZZ          case ILGop_16Sto32:
//ZZ             loaded = unop(Iop_16Sto32, loadLE(Ity_I16, addr)); break;
//ZZ          default:
//ZZ             vassert(0);
//ZZ       }
//ZZ       vassert(loaded != NULL);
//ZZ       assign(dst, loaded);
//ZZ    } else {
//ZZ       /* Generate a guarded load into 'dst', but apply 'cvt' to the
//ZZ          loaded data before putting the data in 'dst'.  If the load
//ZZ          does not take place, 'alt' is placed directly in 'dst'. */
//ZZ       stmt( IRStmt_LoadG(Iend_LE, cvt, dst, addr, alt,
//ZZ                          binop(Iop_CmpNE32, mkexpr(guardT), mkU32(0))) );
//ZZ    }
//ZZ }

/* Generate a new temporary of the given type. */
static IRTemp newTemp ( IRType ty )
{
   vassert(isPlausibleIRType(ty));
   return newIRTemp( irsb->tyenv, ty );
}

/* This is used in many places, so the brevity is an advantage. */
static IRTemp newTempV128(void)
{
   return newTemp(Ity_V128);
}

/* Initialise V128 temporaries en masse. */
static
void newTempsV128_2(IRTemp* t1, IRTemp* t2)
{
   vassert(t1 && *t1 == IRTemp_INVALID);
   vassert(t2 && *t2 == IRTemp_INVALID);
   *t1 = newTempV128();
   *t2 = newTempV128();
}

static
void newTempsV128_3(IRTemp* t1, IRTemp* t2, IRTemp* t3)
{
   vassert(t1 && *t1 == IRTemp_INVALID);
   vassert(t2 && *t2 == IRTemp_INVALID);
   vassert(t3 && *t3 == IRTemp_INVALID);
   *t1 = newTempV128();
   *t2 = newTempV128();
   *t3 = newTempV128();
}

static
void newTempsV128_4(IRTemp* t1, IRTemp* t2, IRTemp* t3, IRTemp* t4)
{
   vassert(t1 && *t1 == IRTemp_INVALID);
   vassert(t2 && *t2 == IRTemp_INVALID);
   vassert(t3 && *t3 == IRTemp_INVALID);
   vassert(t4 && *t4 == IRTemp_INVALID);
   *t1 = newTempV128();
   *t2 = newTempV128();
   *t3 = newTempV128();
   *t4 = newTempV128();
}

static
void newTempsV128_7(IRTemp* t1, IRTemp* t2, IRTemp* t3,
                    IRTemp* t4, IRTemp* t5, IRTemp* t6, IRTemp* t7)
{
   vassert(t1 && *t1 == IRTemp_INVALID);
   vassert(t2 && *t2 == IRTemp_INVALID);
   vassert(t3 && *t3 == IRTemp_INVALID);
   vassert(t4 && *t4 == IRTemp_INVALID);
   vassert(t5 && *t5 == IRTemp_INVALID);
   vassert(t6 && *t6 == IRTemp_INVALID);
   vassert(t7 && *t7 == IRTemp_INVALID);
   *t1 = newTempV128();
   *t2 = newTempV128();
   *t3 = newTempV128();
   *t4 = newTempV128();
   *t5 = newTempV128();
   *t6 = newTempV128();
   *t7 = newTempV128();
}

//ZZ /* Produces a value in 0 .. 3, which is encoded as per the type
//ZZ    IRRoundingMode. */
//ZZ static IRExpr* /* :: Ity_I32 */ get_FAKE_roundingmode ( void )
//ZZ {
//ZZ    return mkU32(Irrm_NEAREST);
//ZZ }
//ZZ 
//ZZ /* Generate an expression for SRC rotated right by ROT. */
//ZZ static IRExpr* genROR32( IRTemp src, Int rot )
//ZZ {
//ZZ    vassert(rot >= 0 && rot < 32);
//ZZ    if (rot == 0)
//ZZ       return mkexpr(src);
//ZZ    return
//ZZ       binop(Iop_Or32,
//ZZ             binop(Iop_Shl32, mkexpr(src), mkU8(32 - rot)),
//ZZ             binop(Iop_Shr32, mkexpr(src), mkU8(rot)));
//ZZ }
//ZZ 
//ZZ static IRExpr* mkU128 ( ULong i )
//ZZ {
//ZZ    return binop(Iop_64HLtoV128, mkU64(i), mkU64(i));
//ZZ }
//ZZ 
//ZZ /* Generate a 4-aligned version of the given expression if
//ZZ    the given condition is true.  Else return it unchanged. */
//ZZ static IRExpr* align4if ( IRExpr* e, Bool b )
//ZZ {
//ZZ    if (b)
//ZZ       return binop(Iop_And32, e, mkU32(~3));
//ZZ    else
//ZZ       return e;
//ZZ }

/* Other IR construction helpers. */
static IROp mkAND ( IRType ty ) {
   switch (ty) {
      case Ity_I32: return Iop_And32;
      case Ity_I64: return Iop_And64;
      default: vpanic("mkAND");
   }
}

static IROp mkOR ( IRType ty ) {
   switch (ty) {
      case Ity_I32: return Iop_Or32;
      case Ity_I64: return Iop_Or64;
      default: vpanic("mkOR");
   }
}

static IROp mkXOR ( IRType ty ) {
   switch (ty) {
      case Ity_I32: return Iop_Xor32;
      case Ity_I64: return Iop_Xor64;
      default: vpanic("mkXOR");
   }
}

static IROp mkSHL ( IRType ty ) {
   switch (ty) {
      case Ity_I32: return Iop_Shl32;
      case Ity_I64: return Iop_Shl64;
      default: vpanic("mkSHL");
   }
}

static IROp mkSHR ( IRType ty ) {
   switch (ty) {
      case Ity_I32: return Iop_Shr32;
      case Ity_I64: return Iop_Shr64;
      default: vpanic("mkSHR");
   }
}

static IROp mkSAR ( IRType ty ) {
   switch (ty) {
      case Ity_I32: return Iop_Sar32;
      case Ity_I64: return Iop_Sar64;
      default: vpanic("mkSAR");
   }
}

static IROp mkNOT ( IRType ty ) {
   switch (ty) {
      case Ity_I32: return Iop_Not32;
      case Ity_I64: return Iop_Not64;
      default: vpanic("mkNOT");
   }
}

static IROp mkADD ( IRType ty ) {
   switch (ty) {
      case Ity_I32: return Iop_Add32;
      case Ity_I64: return Iop_Add64;
      default: vpanic("mkADD");
   }
}

static IROp mkSUB ( IRType ty ) {
   switch (ty) {
      case Ity_I32: return Iop_Sub32;
      case Ity_I64: return Iop_Sub64;
      default: vpanic("mkSUB");
   }
}

static IROp mkADDF ( IRType ty ) {
   switch (ty) {
      case Ity_F32: return Iop_AddF32;
      case Ity_F64: return Iop_AddF64;
      default: vpanic("mkADDF");
   }
}

static IROp mkSUBF ( IRType ty ) {
   switch (ty) {
      case Ity_F32: return Iop_SubF32;
      case Ity_F64: return Iop_SubF64;
      default: vpanic("mkSUBF");
   }
}

static IROp mkMULF ( IRType ty ) {
   switch (ty) {
      case Ity_F32: return Iop_MulF32;
      case Ity_F64: return Iop_MulF64;
      default: vpanic("mkMULF");
   }
}

static IROp mkDIVF ( IRType ty ) {
   switch (ty) {
      case Ity_F32: return Iop_DivF32;
      case Ity_F64: return Iop_DivF64;
      default: vpanic("mkMULF");
   }
}

static IROp mkNEGF ( IRType ty ) {
   switch (ty) {
      case Ity_F32: return Iop_NegF32;
      case Ity_F64: return Iop_NegF64;
      default: vpanic("mkNEGF");
   }
}

static IROp mkABSF ( IRType ty ) {
   switch (ty) {
      case Ity_F32: return Iop_AbsF32;
      case Ity_F64: return Iop_AbsF64;
      default: vpanic("mkNEGF");
   }
}

static IROp mkSQRTF ( IRType ty ) {
   switch (ty) {
      case Ity_F32: return Iop_SqrtF32;
      case Ity_F64: return Iop_SqrtF64;
      default: vpanic("mkNEGF");
   }
}

static IROp mkVecADD ( UInt size ) {
   const IROp ops[4]
      = { Iop_Add8x16, Iop_Add16x8, Iop_Add32x4, Iop_Add64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecQADDU ( UInt size ) {
   const IROp ops[4]
      = { Iop_QAdd8Ux16, Iop_QAdd16Ux8, Iop_QAdd32Ux4, Iop_QAdd64Ux2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecQADDS ( UInt size ) {
   const IROp ops[4]
      = { Iop_QAdd8Sx16, Iop_QAdd16Sx8, Iop_QAdd32Sx4, Iop_QAdd64Sx2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecQADDEXTSUSATUU ( UInt size ) {
   const IROp ops[4]
      = { Iop_QAddExtSUsatUU8x16, Iop_QAddExtSUsatUU16x8,
          Iop_QAddExtSUsatUU32x4, Iop_QAddExtSUsatUU64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecQADDEXTUSSATSS ( UInt size ) {
   const IROp ops[4]
      = { Iop_QAddExtUSsatSS8x16, Iop_QAddExtUSsatSS16x8,
          Iop_QAddExtUSsatSS32x4, Iop_QAddExtUSsatSS64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecSUB ( UInt size ) {
   const IROp ops[4]
      = { Iop_Sub8x16, Iop_Sub16x8, Iop_Sub32x4, Iop_Sub64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecQSUBU ( UInt size ) {
   const IROp ops[4]
      = { Iop_QSub8Ux16, Iop_QSub16Ux8, Iop_QSub32Ux4, Iop_QSub64Ux2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecQSUBS ( UInt size ) {
   const IROp ops[4]
      = { Iop_QSub8Sx16, Iop_QSub16Sx8, Iop_QSub32Sx4, Iop_QSub64Sx2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecSARN ( UInt size ) {
   const IROp ops[4]
      = { Iop_SarN8x16, Iop_SarN16x8, Iop_SarN32x4, Iop_SarN64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecSHRN ( UInt size ) {
   const IROp ops[4]
      = { Iop_ShrN8x16, Iop_ShrN16x8, Iop_ShrN32x4, Iop_ShrN64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecSHLN ( UInt size ) {
   const IROp ops[4]
      = { Iop_ShlN8x16, Iop_ShlN16x8, Iop_ShlN32x4, Iop_ShlN64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecCATEVENLANES ( UInt size ) {
   const IROp ops[4]
      = { Iop_CatEvenLanes8x16, Iop_CatEvenLanes16x8,
          Iop_CatEvenLanes32x4, Iop_InterleaveLO64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecCATODDLANES ( UInt size ) {
   const IROp ops[4]
      = { Iop_CatOddLanes8x16, Iop_CatOddLanes16x8,
          Iop_CatOddLanes32x4, Iop_InterleaveHI64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecINTERLEAVELO ( UInt size ) {
   const IROp ops[4]
      = { Iop_InterleaveLO8x16, Iop_InterleaveLO16x8,
          Iop_InterleaveLO32x4, Iop_InterleaveLO64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecINTERLEAVEHI ( UInt size ) {
   const IROp ops[4]
      = { Iop_InterleaveHI8x16, Iop_InterleaveHI16x8,
          Iop_InterleaveHI32x4, Iop_InterleaveHI64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecMAXU ( UInt size ) {
   const IROp ops[4]
      = { Iop_Max8Ux16, Iop_Max16Ux8, Iop_Max32Ux4, Iop_Max64Ux2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecMAXS ( UInt size ) {
   const IROp ops[4]
      = { Iop_Max8Sx16, Iop_Max16Sx8, Iop_Max32Sx4, Iop_Max64Sx2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecMINU ( UInt size ) {
   const IROp ops[4]
      = { Iop_Min8Ux16, Iop_Min16Ux8, Iop_Min32Ux4, Iop_Min64Ux2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecMINS ( UInt size ) {
   const IROp ops[4]
      = { Iop_Min8Sx16, Iop_Min16Sx8, Iop_Min32Sx4, Iop_Min64Sx2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecMUL ( UInt size ) {
   const IROp ops[4]
      = { Iop_Mul8x16, Iop_Mul16x8, Iop_Mul32x4, Iop_INVALID };
   vassert(size < 3);
   return ops[size];
}

static IROp mkVecMULLU ( UInt sizeNarrow ) {
   const IROp ops[4]
      = { Iop_Mull8Ux8, Iop_Mull16Ux4, Iop_Mull32Ux2, Iop_INVALID };
   vassert(sizeNarrow < 3);
   return ops[sizeNarrow];
}

static IROp mkVecMULLS ( UInt sizeNarrow ) {
   const IROp ops[4]
      = { Iop_Mull8Sx8, Iop_Mull16Sx4, Iop_Mull32Sx2, Iop_INVALID };
   vassert(sizeNarrow < 3);
   return ops[sizeNarrow];
}

static IROp mkVecQDMULLS ( UInt sizeNarrow ) {
   const IROp ops[4]
      = { Iop_INVALID, Iop_QDMull16Sx4, Iop_QDMull32Sx2, Iop_INVALID };
   vassert(sizeNarrow < 3);
   return ops[sizeNarrow];
}

static IROp mkVecCMPEQ ( UInt size ) {
   const IROp ops[4]
      = { Iop_CmpEQ8x16, Iop_CmpEQ16x8, Iop_CmpEQ32x4, Iop_CmpEQ64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecCMPGTU ( UInt size ) {
   const IROp ops[4]
      = { Iop_CmpGT8Ux16, Iop_CmpGT16Ux8, Iop_CmpGT32Ux4, Iop_CmpGT64Ux2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecCMPGTS ( UInt size ) {
   const IROp ops[4]
      = { Iop_CmpGT8Sx16, Iop_CmpGT16Sx8, Iop_CmpGT32Sx4, Iop_CmpGT64Sx2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecABS ( UInt size ) {
   const IROp ops[4]
      = { Iop_Abs8x16, Iop_Abs16x8, Iop_Abs32x4, Iop_Abs64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecZEROHIxxOFV128 ( UInt size ) {
   const IROp ops[4]
      = { Iop_ZeroHI120ofV128, Iop_ZeroHI112ofV128,
          Iop_ZeroHI96ofV128,  Iop_ZeroHI64ofV128 };
   vassert(size < 4);
   return ops[size];
}

static IRExpr* mkU ( IRType ty, ULong imm ) {
   switch (ty) {
      case Ity_I32: return mkU32((UInt)(imm & 0xFFFFFFFFULL));
      case Ity_I64: return mkU64(imm);
      default: vpanic("mkU");
   }
}

static IROp mkVecQDMULHIS ( UInt size ) {
   const IROp ops[4]
      = { Iop_INVALID, Iop_QDMulHi16Sx8, Iop_QDMulHi32Sx4, Iop_INVALID };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecQRDMULHIS ( UInt size ) {
   const IROp ops[4]
      = { Iop_INVALID, Iop_QRDMulHi16Sx8, Iop_QRDMulHi32Sx4, Iop_INVALID };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecQANDUQSH ( UInt size ) {
   const IROp ops[4]
      = { Iop_QandUQsh8x16, Iop_QandUQsh16x8,
          Iop_QandUQsh32x4, Iop_QandUQsh64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecQANDSQSH ( UInt size ) {
   const IROp ops[4]
      = { Iop_QandSQsh8x16, Iop_QandSQsh16x8,
          Iop_QandSQsh32x4, Iop_QandSQsh64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecQANDUQRSH ( UInt size ) {
   const IROp ops[4]
      = { Iop_QandUQRsh8x16, Iop_QandUQRsh16x8,
          Iop_QandUQRsh32x4, Iop_QandUQRsh64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecQANDSQRSH ( UInt size ) {
   const IROp ops[4]
      = { Iop_QandSQRsh8x16, Iop_QandSQRsh16x8,
          Iop_QandSQRsh32x4, Iop_QandSQRsh64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecSHU ( UInt size ) {
   const IROp ops[4]
      = { Iop_Sh8Ux16, Iop_Sh16Ux8, Iop_Sh32Ux4, Iop_Sh64Ux2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecSHS ( UInt size ) {
   const IROp ops[4]
      = { Iop_Sh8Sx16, Iop_Sh16Sx8, Iop_Sh32Sx4, Iop_Sh64Sx2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecRSHU ( UInt size ) {
   const IROp ops[4]
      = { Iop_Rsh8Ux16, Iop_Rsh16Ux8, Iop_Rsh32Ux4, Iop_Rsh64Ux2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecRSHS ( UInt size ) {
   const IROp ops[4]
      = { Iop_Rsh8Sx16, Iop_Rsh16Sx8, Iop_Rsh32Sx4, Iop_Rsh64Sx2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecNARROWUN ( UInt sizeNarrow ) {
   const IROp ops[4]
      = { Iop_NarrowUn16to8x8, Iop_NarrowUn32to16x4,
          Iop_NarrowUn64to32x2, Iop_INVALID };
   vassert(sizeNarrow < 4);
   return ops[sizeNarrow];
}

static IROp mkVecQNARROWUNSU ( UInt sizeNarrow ) {
   const IROp ops[4]
      = { Iop_QNarrowUn16Sto8Ux8,  Iop_QNarrowUn32Sto16Ux4,
          Iop_QNarrowUn64Sto32Ux2, Iop_INVALID };
   vassert(sizeNarrow < 4);
   return ops[sizeNarrow];
}

static IROp mkVecQNARROWUNSS ( UInt sizeNarrow ) {
   const IROp ops[4]
      = { Iop_QNarrowUn16Sto8Sx8,  Iop_QNarrowUn32Sto16Sx4,
          Iop_QNarrowUn64Sto32Sx2, Iop_INVALID };
   vassert(sizeNarrow < 4);
   return ops[sizeNarrow];
}

static IROp mkVecQNARROWUNUU ( UInt sizeNarrow ) {
   const IROp ops[4]
      = { Iop_QNarrowUn16Uto8Ux8,  Iop_QNarrowUn32Uto16Ux4,
          Iop_QNarrowUn64Uto32Ux2, Iop_INVALID };
   vassert(sizeNarrow < 4);
   return ops[sizeNarrow];
}

static IROp mkVecQANDqshrNNARROWUU ( UInt sizeNarrow ) {
   const IROp ops[4]
      = { Iop_QandQShrNnarrow16Uto8Ux8, Iop_QandQShrNnarrow32Uto16Ux4,
          Iop_QandQShrNnarrow64Uto32Ux2, Iop_INVALID };
   vassert(sizeNarrow < 4);
   return ops[sizeNarrow];
}

static IROp mkVecQANDqsarNNARROWSS ( UInt sizeNarrow ) {
   const IROp ops[4]
      = { Iop_QandQSarNnarrow16Sto8Sx8,  Iop_QandQSarNnarrow32Sto16Sx4,
          Iop_QandQSarNnarrow64Sto32Sx2, Iop_INVALID };
   vassert(sizeNarrow < 4);
   return ops[sizeNarrow];
}

static IROp mkVecQANDqsarNNARROWSU ( UInt sizeNarrow ) {
   const IROp ops[4]
      = { Iop_QandQSarNnarrow16Sto8Ux8,  Iop_QandQSarNnarrow32Sto16Ux4,
          Iop_QandQSarNnarrow64Sto32Ux2, Iop_INVALID };
   vassert(sizeNarrow < 4);
   return ops[sizeNarrow];
}

static IROp mkVecQANDqrshrNNARROWUU ( UInt sizeNarrow ) {
   const IROp ops[4]
      = { Iop_QandQRShrNnarrow16Uto8Ux8,  Iop_QandQRShrNnarrow32Uto16Ux4,
          Iop_QandQRShrNnarrow64Uto32Ux2, Iop_INVALID };
   vassert(sizeNarrow < 4);
   return ops[sizeNarrow];
}

static IROp mkVecQANDqrsarNNARROWSS ( UInt sizeNarrow ) {
   const IROp ops[4]
      = { Iop_QandQRSarNnarrow16Sto8Sx8,  Iop_QandQRSarNnarrow32Sto16Sx4,
          Iop_QandQRSarNnarrow64Sto32Sx2, Iop_INVALID };
   vassert(sizeNarrow < 4);
   return ops[sizeNarrow];
}

static IROp mkVecQANDqrsarNNARROWSU ( UInt sizeNarrow ) {
   const IROp ops[4]
      = { Iop_QandQRSarNnarrow16Sto8Ux8,  Iop_QandQRSarNnarrow32Sto16Ux4,
          Iop_QandQRSarNnarrow64Sto32Ux2, Iop_INVALID };
   vassert(sizeNarrow < 4);
   return ops[sizeNarrow];
}

static IROp mkVecQSHLNSATUU ( UInt size ) {
   const IROp ops[4]
      = { Iop_QShlNsatUU8x16, Iop_QShlNsatUU16x8,
          Iop_QShlNsatUU32x4, Iop_QShlNsatUU64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecQSHLNSATSS ( UInt size ) {
   const IROp ops[4]
      = { Iop_QShlNsatSS8x16, Iop_QShlNsatSS16x8,
          Iop_QShlNsatSS32x4, Iop_QShlNsatSS64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecQSHLNSATSU ( UInt size ) {
   const IROp ops[4]
      = { Iop_QShlNsatSU8x16, Iop_QShlNsatSU16x8,
          Iop_QShlNsatSU32x4, Iop_QShlNsatSU64x2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecADDF ( UInt size ) {
   const IROp ops[4]
      = { Iop_INVALID, Iop_INVALID, Iop_Add32Fx4, Iop_Add64Fx2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecMAXF ( UInt size ) {
   const IROp ops[4]
      = { Iop_INVALID, Iop_INVALID, Iop_Max32Fx4, Iop_Max64Fx2 };
   vassert(size < 4);
   return ops[size];
}

static IROp mkVecMINF ( UInt size ) {
   const IROp ops[4]
      = { Iop_INVALID, Iop_INVALID, Iop_Min32Fx4, Iop_Min64Fx2 };
   vassert(size < 4);
   return ops[size];
}

/* Generate IR to create 'arg rotated right by imm', for sane values
   of 'ty' and 'imm'. */
static IRTemp mathROR ( IRType ty, IRTemp arg, UInt imm )
{
   UInt w = 0;
   if (ty == Ity_I64) {
      w = 64;
   } else {
      vassert(ty == Ity_I32);
      w = 32;
   }
   vassert(w != 0);
   vassert(imm < w);
   if (imm == 0) {
      return arg;
   }
   IRTemp res = newTemp(ty);
   assign(res, binop(mkOR(ty),
                     binop(mkSHL(ty), mkexpr(arg), mkU8(w - imm)),
                     binop(mkSHR(ty), mkexpr(arg), mkU8(imm)) ));
   return res;
}

/* Generate IR to set the returned temp to either all-zeroes or
   all ones, as a copy of arg<imm>. */
static IRTemp mathREPLICATE ( IRType ty, IRTemp arg, UInt imm )
{
   UInt w = 0;
   if (ty == Ity_I64) {
      w = 64;
   } else {
      vassert(ty == Ity_I32);
      w = 32;
   }
   vassert(w != 0);
   vassert(imm < w);
   IRTemp res = newTemp(ty);
   assign(res, binop(mkSAR(ty),
                     binop(mkSHL(ty), mkexpr(arg), mkU8(w - 1 - imm)),
                     mkU8(w - 1)));
   return res;
}

/* U-widen 8/16/32/64 bit int expr to 64. */
static IRExpr* widenUto64 ( IRType srcTy, IRExpr* e )
{
   switch (srcTy) {
      case Ity_I64: return e;
      case Ity_I32: return unop(Iop_32Uto64, e);
      case Ity_I16: return unop(Iop_16Uto64, e);
      case Ity_I8:  return unop(Iop_8Uto64, e);
      default: vpanic("widenUto64(arm64)");
   }
}

/* Narrow 64 bit int expr to 8/16/32/64.  Clearly only some
   of these combinations make sense. */
static IRExpr* narrowFrom64 ( IRType dstTy, IRExpr* e )
{
   switch (dstTy) {
      case Ity_I64: return e;
      case Ity_I32: return unop(Iop_64to32, e);
      case Ity_I16: return unop(Iop_64to16, e);
      case Ity_I8:  return unop(Iop_64to8, e);
      default: vpanic("narrowFrom64(arm64)");
   }
}

/*------------------------------------------------------------*/
/*--- Helpers for accessing guest registers.               ---*/
/*------------------------------------------------------------*/

#define OFFB_X0       offsetof(VexGuestARM64State,guest_X0)
#define OFFB_X1       offsetof(VexGuestARM64State,guest_X1)
#define OFFB_X2       offsetof(VexGuestARM64State,guest_X2)
#define OFFB_X3       offsetof(VexGuestARM64State,guest_X3)
#define OFFB_X4       offsetof(VexGuestARM64State,guest_X4)
#define OFFB_X5       offsetof(VexGuestARM64State,guest_X5)
#define OFFB_X6       offsetof(VexGuestARM64State,guest_X6)
#define OFFB_X7       offsetof(VexGuestARM64State,guest_X7)
#define OFFB_X8       offsetof(VexGuestARM64State,guest_X8)
#define OFFB_X9       offsetof(VexGuestARM64State,guest_X9)
#define OFFB_X10      offsetof(VexGuestARM64State,guest_X10)
#define OFFB_X11      offsetof(VexGuestARM64State,guest_X11)
#define OFFB_X12      offsetof(VexGuestARM64State,guest_X12)
#define OFFB_X13      offsetof(VexGuestARM64State,guest_X13)
#define OFFB_X14      offsetof(VexGuestARM64State,guest_X14)
#define OFFB_X15      offsetof(VexGuestARM64State,guest_X15)
#define OFFB_X16      offsetof(VexGuestARM64State,guest_X16)
#define OFFB_X17      offsetof(VexGuestARM64State,guest_X17)
#define OFFB_X18      offsetof(VexGuestARM64State,guest_X18)
#define OFFB_X19      offsetof(VexGuestARM64State,guest_X19)
#define OFFB_X20      offsetof(VexGuestARM64State,guest_X20)
#define OFFB_X21      offsetof(VexGuestARM64State,guest_X21)
#define OFFB_X22      offsetof(VexGuestARM64State,guest_X22)
#define OFFB_X23      offsetof(VexGuestARM64State,guest_X23)
#define OFFB_X24      offsetof(VexGuestARM64State,guest_X24)
#define OFFB_X25      offsetof(VexGuestARM64State,guest_X25)
#define OFFB_X26      offsetof(VexGuestARM64State,guest_X26)
#define OFFB_X27      offsetof(VexGuestARM64State,guest_X27)
#define OFFB_X28      offsetof(VexGuestARM64State,guest_X28)
#define OFFB_X29      offsetof(VexGuestARM64State,guest_X29)
#define OFFB_X30      offsetof(VexGuestARM64State,guest_X30)

#define OFFB_XSP      offsetof(VexGuestARM64State,guest_XSP)
#define OFFB_PC       offsetof(VexGuestARM64State,guest_PC)

#define OFFB_CC_OP    offsetof(VexGuestARM64State,guest_CC_OP)
#define OFFB_CC_DEP1  offsetof(VexGuestARM64State,guest_CC_DEP1)
#define OFFB_CC_DEP2  offsetof(VexGuestARM64State,guest_CC_DEP2)
#define OFFB_CC_NDEP  offsetof(VexGuestARM64State,guest_CC_NDEP)

#define OFFB_TPIDR_EL0 offsetof(VexGuestARM64State,guest_TPIDR_EL0)
#define OFFB_NRADDR   offsetof(VexGuestARM64State,guest_NRADDR)

#define OFFB_Q0       offsetof(VexGuestARM64State,guest_Q0)
#define OFFB_Q1       offsetof(VexGuestARM64State,guest_Q1)
#define OFFB_Q2       offsetof(VexGuestARM64State,guest_Q2)
#define OFFB_Q3       offsetof(VexGuestARM64State,guest_Q3)
#define OFFB_Q4       offsetof(VexGuestARM64State,guest_Q4)
#define OFFB_Q5       offsetof(VexGuestARM64State,guest_Q5)
#define OFFB_Q6       offsetof(VexGuestARM64State,guest_Q6)
#define OFFB_Q7       offsetof(VexGuestARM64State,guest_Q7)
#define OFFB_Q8       offsetof(VexGuestARM64State,guest_Q8)
#define OFFB_Q9       offsetof(VexGuestARM64State,guest_Q9)
#define OFFB_Q10      offsetof(VexGuestARM64State,guest_Q10)
#define OFFB_Q11      offsetof(VexGuestARM64State,guest_Q11)
#define OFFB_Q12      offsetof(VexGuestARM64State,guest_Q12)
#define OFFB_Q13      offsetof(VexGuestARM64State,guest_Q13)
#define OFFB_Q14      offsetof(VexGuestARM64State,guest_Q14)
#define OFFB_Q15      offsetof(VexGuestARM64State,guest_Q15)
#define OFFB_Q16      offsetof(VexGuestARM64State,guest_Q16)
#define OFFB_Q17      offsetof(VexGuestARM64State,guest_Q17)
#define OFFB_Q18      offsetof(VexGuestARM64State,guest_Q18)
#define OFFB_Q19      offsetof(VexGuestARM64State,guest_Q19)
#define OFFB_Q20      offsetof(VexGuestARM64State,guest_Q20)
#define OFFB_Q21      offsetof(VexGuestARM64State,guest_Q21)
#define OFFB_Q22      offsetof(VexGuestARM64State,guest_Q22)
#define OFFB_Q23      offsetof(VexGuestARM64State,guest_Q23)
#define OFFB_Q24      offsetof(VexGuestARM64State,guest_Q24)
#define OFFB_Q25      offsetof(VexGuestARM64State,guest_Q25)
#define OFFB_Q26      offsetof(VexGuestARM64State,guest_Q26)
#define OFFB_Q27      offsetof(VexGuestARM64State,guest_Q27)
#define OFFB_Q28      offsetof(VexGuestARM64State,guest_Q28)
#define OFFB_Q29      offsetof(VexGuestARM64State,guest_Q29)
#define OFFB_Q30      offsetof(VexGuestARM64State,guest_Q30)
#define OFFB_Q31      offsetof(VexGuestARM64State,guest_Q31)

#define OFFB_FPCR     offsetof(VexGuestARM64State,guest_FPCR)
#define OFFB_QCFLAG   offsetof(VexGuestARM64State,guest_QCFLAG)

#define OFFB_CMSTART  offsetof(VexGuestARM64State,guest_CMSTART)
#define OFFB_CMLEN    offsetof(VexGuestARM64State,guest_CMLEN)

/* ---------------- Integer registers ---------------- */

static Int offsetIReg64 ( UInt iregNo )
{
   /* Do we care about endianness here?  We do if sub-parts of integer
      registers are accessed. */
   switch (iregNo) {
      case 0:  return OFFB_X0;
      case 1:  return OFFB_X1;
      case 2:  return OFFB_X2;
      case 3:  return OFFB_X3;
      case 4:  return OFFB_X4;
      case 5:  return OFFB_X5;
      case 6:  return OFFB_X6;
      case 7:  return OFFB_X7;
      case 8:  return OFFB_X8;
      case 9:  return OFFB_X9;
      case 10: return OFFB_X10;
      case 11: return OFFB_X11;
      case 12: return OFFB_X12;
      case 13: return OFFB_X13;
      case 14: return OFFB_X14;
      case 15: return OFFB_X15;
      case 16: return OFFB_X16;
      case 17: return OFFB_X17;
      case 18: return OFFB_X18;
      case 19: return OFFB_X19;
      case 20: return OFFB_X20;
      case 21: return OFFB_X21;
      case 22: return OFFB_X22;
      case 23: return OFFB_X23;
      case 24: return OFFB_X24;
      case 25: return OFFB_X25;
      case 26: return OFFB_X26;
      case 27: return OFFB_X27;
      case 28: return OFFB_X28;
      case 29: return OFFB_X29;
      case 30: return OFFB_X30;
      /* but not 31 */
      default: vassert(0);
   }
}

static Int offsetIReg64orSP ( UInt iregNo )
{
   return iregNo == 31  ? OFFB_XSP  : offsetIReg64(iregNo);
}

static const HChar* nameIReg64orZR ( UInt iregNo )
{
   vassert(iregNo < 32);
   static const HChar* names[32]
      = { "x0",  "x1",  "x2",  "x3",  "x4",  "x5",  "x6",  "x7", 
          "x8",  "x9",  "x10", "x11", "x12", "x13", "x14", "x15", 
          "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23", 
          "x24", "x25", "x26", "x27", "x28", "x29", "x30", "xzr" };
   return names[iregNo];
}

static const HChar* nameIReg64orSP ( UInt iregNo )
{
   if (iregNo == 31) {
      return "sp";
   }
   vassert(iregNo < 31);
   return nameIReg64orZR(iregNo);
}

static IRExpr* getIReg64orSP ( UInt iregNo )
{
   vassert(iregNo < 32);
   return IRExpr_Get( offsetIReg64orSP(iregNo), Ity_I64 );
}

static IRExpr* getIReg64orZR ( UInt iregNo )
{
   if (iregNo == 31) {
      return mkU64(0);
   }
   vassert(iregNo < 31);
   return IRExpr_Get( offsetIReg64orSP(iregNo), Ity_I64 );
}

static void putIReg64orSP ( UInt iregNo, IRExpr* e ) 
{
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64);
   stmt( IRStmt_Put(offsetIReg64orSP(iregNo), e) );
}

static void putIReg64orZR ( UInt iregNo, IRExpr* e ) 
{
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64);
   if (iregNo == 31) {
      return;
   }
   vassert(iregNo < 31);
   stmt( IRStmt_Put(offsetIReg64orSP(iregNo), e) );
}

static const HChar* nameIReg32orZR ( UInt iregNo )
{
   vassert(iregNo < 32);
   static const HChar* names[32]
      = { "w0",  "w1",  "w2",  "w3",  "w4",  "w5",  "w6",  "w7", 
          "w8",  "w9",  "w10", "w11", "w12", "w13", "w14", "w15", 
          "w16", "w17", "w18", "w19", "w20", "w21", "w22", "w23", 
          "w24", "w25", "w26", "w27", "w28", "w29", "w30", "wzr" };
   return names[iregNo];
}

static const HChar* nameIReg32orSP ( UInt iregNo )
{
   if (iregNo == 31) {
      return "wsp";
   }
   vassert(iregNo < 31);
   return nameIReg32orZR(iregNo);
}

static IRExpr* getIReg32orSP ( UInt iregNo )
{
   vassert(iregNo < 32);
   return unop(Iop_64to32,
               IRExpr_Get( offsetIReg64orSP(iregNo), Ity_I64 ));
}

static IRExpr* getIReg32orZR ( UInt iregNo )
{
   if (iregNo == 31) {
      return mkU32(0);
   }
   vassert(iregNo < 31);
   return unop(Iop_64to32,
               IRExpr_Get( offsetIReg64orSP(iregNo), Ity_I64 ));
}

static void putIReg32orSP ( UInt iregNo, IRExpr* e ) 
{
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   stmt( IRStmt_Put(offsetIReg64orSP(iregNo), unop(Iop_32Uto64, e)) );
}

static void putIReg32orZR ( UInt iregNo, IRExpr* e ) 
{
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   if (iregNo == 31) {
      return;
   }
   vassert(iregNo < 31);
   stmt( IRStmt_Put(offsetIReg64orSP(iregNo), unop(Iop_32Uto64, e)) );
}

static const HChar* nameIRegOrSP ( Bool is64, UInt iregNo )
{
   vassert(is64 == True || is64 == False);
   return is64 ? nameIReg64orSP(iregNo) : nameIReg32orSP(iregNo);
}

static const HChar* nameIRegOrZR ( Bool is64, UInt iregNo )
{
   vassert(is64 == True || is64 == False);
   return is64 ? nameIReg64orZR(iregNo) : nameIReg32orZR(iregNo);
}

static IRExpr* getIRegOrZR ( Bool is64, UInt iregNo )
{
   vassert(is64 == True || is64 == False);
   return is64 ? getIReg64orZR(iregNo) : getIReg32orZR(iregNo);
}

static void putIRegOrZR ( Bool is64, UInt iregNo, IRExpr* e )
{
   vassert(is64 == True || is64 == False);
   if (is64) putIReg64orZR(iregNo, e); else putIReg32orZR(iregNo, e);
}

static void putPC ( IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64);
   stmt( IRStmt_Put(OFFB_PC, e) );
}

/* ---------------- Vector (Q) registers ---------------- */

static Int offsetQReg128 ( UInt qregNo )
{
   /* We don't care about endianness at this point.  It only becomes
      relevant when dealing with sections of these registers.*/
   switch (qregNo) {
      case 0:  return OFFB_Q0;
      case 1:  return OFFB_Q1;
      case 2:  return OFFB_Q2;
      case 3:  return OFFB_Q3;
      case 4:  return OFFB_Q4;
      case 5:  return OFFB_Q5;
      case 6:  return OFFB_Q6;
      case 7:  return OFFB_Q7;
      case 8:  return OFFB_Q8;
      case 9:  return OFFB_Q9;
      case 10: return OFFB_Q10;
      case 11: return OFFB_Q11;
      case 12: return OFFB_Q12;
      case 13: return OFFB_Q13;
      case 14: return OFFB_Q14;
      case 15: return OFFB_Q15;
      case 16: return OFFB_Q16;
      case 17: return OFFB_Q17;
      case 18: return OFFB_Q18;
      case 19: return OFFB_Q19;
      case 20: return OFFB_Q20;
      case 21: return OFFB_Q21;
      case 22: return OFFB_Q22;
      case 23: return OFFB_Q23;
      case 24: return OFFB_Q24;
      case 25: return OFFB_Q25;
      case 26: return OFFB_Q26;
      case 27: return OFFB_Q27;
      case 28: return OFFB_Q28;
      case 29: return OFFB_Q29;
      case 30: return OFFB_Q30;
      case 31: return OFFB_Q31;
      default: vassert(0);
   }
}

/* Write to a complete Qreg. */
static void putQReg128 ( UInt qregNo, IRExpr* e )
{
   vassert(qregNo < 32);
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_V128);
   stmt( IRStmt_Put(offsetQReg128(qregNo), e) );
}

/* Read a complete Qreg. */
static IRExpr* getQReg128 ( UInt qregNo )
{
   vassert(qregNo < 32);
   return IRExpr_Get(offsetQReg128(qregNo), Ity_V128);
}

/* Produce the IR type for some sub-part of a vector.  For 32- and 64-
   bit sub-parts we can choose either integer or float types, and
   choose float on the basis that that is the common use case and so
   will give least interference with Put-to-Get forwarding later
   on. */
static IRType preferredVectorSubTypeFromSize ( UInt szB )
{
   switch (szB) {
      case 1:  return Ity_I8;
      case 2:  return Ity_I16;
      case 4:  return Ity_I32; //Ity_F32;
      case 8:  return Ity_F64;
      case 16: return Ity_V128;
      default: vassert(0);
   }
}

/* Find the offset of the laneNo'th lane of type laneTy in the given
   Qreg.  Since the host is little-endian, the least significant lane
   has the lowest offset. */
static Int offsetQRegLane ( UInt qregNo, IRType laneTy, UInt laneNo )
{
   vassert(host_endness == VexEndnessLE);
   Int base = offsetQReg128(qregNo);
   /* Since the host is little-endian, the least significant lane
      will be at the lowest address. */
   /* Restrict this to known types, so as to avoid silently accepting
      stupid types. */
   UInt laneSzB = 0;
   switch (laneTy) {
      case Ity_I8:                 laneSzB = 1;  break;
      case Ity_F16: case Ity_I16:  laneSzB = 2;  break;
      case Ity_F32: case Ity_I32:  laneSzB = 4;  break;
      case Ity_F64: case Ity_I64:  laneSzB = 8;  break;
      case Ity_V128:               laneSzB = 16; break;
      default: break;
   }
   vassert(laneSzB > 0);
   UInt minOff = laneNo * laneSzB;
   UInt maxOff = minOff + laneSzB - 1;
   vassert(maxOff < 16);
   return base + minOff;
}

/* Put to the least significant lane of a Qreg. */
static void putQRegLO ( UInt qregNo, IRExpr* e )
{
   IRType ty  = typeOfIRExpr(irsb->tyenv, e);
   Int    off = offsetQRegLane(qregNo, ty, 0);
   switch (ty) {
      case Ity_I8:  case Ity_I16: case Ity_I32: case Ity_I64:
      case Ity_F16: case Ity_F32: case Ity_F64: case Ity_V128:
         break;
      default:
         vassert(0); // Other cases are probably invalid
   }
   stmt(IRStmt_Put(off, e));
}

/* Get from the least significant lane of a Qreg. */
static IRExpr* getQRegLO ( UInt qregNo, IRType ty )
{
   Int off = offsetQRegLane(qregNo, ty, 0);
   switch (ty) {
      case Ity_I8:
      case Ity_F16: case Ity_I16:
      case Ity_I32: case Ity_I64:
      case Ity_F32: case Ity_F64: case Ity_V128:
         break;
      default:
         vassert(0); // Other cases are ATC
   }
   return IRExpr_Get(off, ty);
}

static const HChar* nameQRegLO ( UInt qregNo, IRType laneTy )
{
   static const HChar* namesQ[32]
      = { "q0",  "q1",  "q2",  "q3",  "q4",  "q5",  "q6",  "q7", 
          "q8",  "q9",  "q10", "q11", "q12", "q13", "q14", "q15", 
          "q16", "q17", "q18", "q19", "q20", "q21", "q22", "q23", 
          "q24", "q25", "q26", "q27", "q28", "q29", "q30", "q31" };
   static const HChar* namesD[32]
      = { "d0",  "d1",  "d2",  "d3",  "d4",  "d5",  "d6",  "d7", 
          "d8",  "d9",  "d10", "d11", "d12", "d13", "d14", "d15", 
          "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", 
          "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31" };
   static const HChar* namesS[32]
      = { "s0",  "s1",  "s2",  "s3",  "s4",  "s5",  "s6",  "s7", 
          "s8",  "s9",  "s10", "s11", "s12", "s13", "s14", "s15", 
          "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23", 
          "s24", "s25", "s26", "s27", "s28", "s29", "s30", "s31" };
   static const HChar* namesH[32]
      = { "h0",  "h1",  "h2",  "h3",  "h4",  "h5",  "h6",  "h7", 
          "h8",  "h9",  "h10", "h11", "h12", "h13", "h14", "h15", 
          "h16", "h17", "h18", "h19", "h20", "h21", "h22", "h23", 
          "h24", "h25", "h26", "h27", "h28", "h29", "h30", "h31" };
   static const HChar* namesB[32]
      = { "b0",  "b1",  "b2",  "b3",  "b4",  "b5",  "b6",  "b7", 
          "b8",  "b9",  "b10", "b11", "b12", "b13", "b14", "b15", 
          "b16", "b17", "b18", "b19", "b20", "b21", "b22", "b23", 
          "b24", "b25", "b26", "b27", "b28", "b29", "b30", "b31" };
   vassert(qregNo < 32);
   switch (sizeofIRType(laneTy)) {
      case 1:  return namesB[qregNo];
      case 2:  return namesH[qregNo];
      case 4:  return namesS[qregNo];
      case 8:  return namesD[qregNo];
      case 16: return namesQ[qregNo];
      default: vassert(0);
   }
   /*NOTREACHED*/
}

static const HChar* nameQReg128 ( UInt qregNo )
{
   return nameQRegLO(qregNo, Ity_V128);
}

/* Find the offset of the most significant half (8 bytes) of the given
   Qreg.  This requires knowing the endianness of the host. */
static Int offsetQRegHI64 ( UInt qregNo )
{
   return offsetQRegLane(qregNo, Ity_I64, 1);
}

static IRExpr* getQRegHI64 ( UInt qregNo )
{
   return IRExpr_Get(offsetQRegHI64(qregNo), Ity_I64);
}

static void putQRegHI64 ( UInt qregNo, IRExpr* e )
{
   IRType ty  = typeOfIRExpr(irsb->tyenv, e);
   Int    off = offsetQRegHI64(qregNo);
   switch (ty) {
      case Ity_I64: case Ity_F64:
         break;
      default:
         vassert(0); // Other cases are plain wrong
   }
   stmt(IRStmt_Put(off, e));
}

/* Put to a specified lane of a Qreg. */
static void putQRegLane ( UInt qregNo, UInt laneNo, IRExpr* e )
{
   IRType laneTy  = typeOfIRExpr(irsb->tyenv, e);
   Int    off     = offsetQRegLane(qregNo, laneTy, laneNo);
   switch (laneTy) {
      case Ity_F64: case Ity_I64:
      case Ity_I32: case Ity_F32:
      case Ity_I16: case Ity_F16:
      case Ity_I8:
         break;
      default:
         vassert(0); // Other cases are ATC
   }
   stmt(IRStmt_Put(off, e));
}

/* Get from a specified lane of a Qreg. */
static IRExpr* getQRegLane ( UInt qregNo, UInt laneNo, IRType laneTy )
{
   Int off = offsetQRegLane(qregNo, laneTy, laneNo);
   switch (laneTy) {
      case Ity_I64: case Ity_I32: case Ity_I16: case Ity_I8:
      case Ity_F64: case Ity_F32: case Ity_F16:
         break;
      default:
         vassert(0); // Other cases are ATC
   }
   return IRExpr_Get(off, laneTy);
}

//ZZ /* ---------------- Misc registers ---------------- */
//ZZ 
//ZZ static void putMiscReg32 ( UInt    gsoffset, 
//ZZ                            IRExpr* e, /* :: Ity_I32 */
//ZZ                            IRTemp  guardT /* :: Ity_I32, 0 or 1 */)
//ZZ {
//ZZ    switch (gsoffset) {
//ZZ       case OFFB_FPSCR:   break;
//ZZ       case OFFB_QFLAG32: break;
//ZZ       case OFFB_GEFLAG0: break;
//ZZ       case OFFB_GEFLAG1: break;
//ZZ       case OFFB_GEFLAG2: break;
//ZZ       case OFFB_GEFLAG3: break;
//ZZ       default: vassert(0); /* awaiting more cases */
//ZZ    }
//ZZ    vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
//ZZ 
//ZZ    if (guardT == IRTemp_INVALID) {
//ZZ       /* unconditional write */
//ZZ       stmt(IRStmt_Put(gsoffset, e));
//ZZ    } else {
//ZZ       stmt(IRStmt_Put(
//ZZ          gsoffset,
//ZZ          IRExpr_ITE( binop(Iop_CmpNE32, mkexpr(guardT), mkU32(0)),
//ZZ                      e, IRExpr_Get(gsoffset, Ity_I32) )
//ZZ       ));
//ZZ    }
//ZZ }
//ZZ 
//ZZ static IRTemp get_ITSTATE ( void )
//ZZ {
//ZZ    ASSERT_IS_THUMB;
//ZZ    IRTemp t = newTemp(Ity_I32);
//ZZ    assign(t, IRExpr_Get( OFFB_ITSTATE, Ity_I32));
//ZZ    return t;
//ZZ }
//ZZ 
//ZZ static void put_ITSTATE ( IRTemp t )
//ZZ {
//ZZ    ASSERT_IS_THUMB;
//ZZ    stmt( IRStmt_Put( OFFB_ITSTATE, mkexpr(t)) );
//ZZ }
//ZZ 
//ZZ static IRTemp get_QFLAG32 ( void )
//ZZ {
//ZZ    IRTemp t = newTemp(Ity_I32);
//ZZ    assign(t, IRExpr_Get( OFFB_QFLAG32, Ity_I32));
//ZZ    return t;
//ZZ }
//ZZ 
//ZZ static void put_QFLAG32 ( IRTemp t, IRTemp condT )
//ZZ {
//ZZ    putMiscReg32( OFFB_QFLAG32, mkexpr(t), condT );
//ZZ }
//ZZ 
//ZZ /* Stickily set the 'Q' flag (APSR bit 27) of the APSR (Application Program
//ZZ    Status Register) to indicate that overflow or saturation occurred.
//ZZ    Nb: t must be zero to denote no saturation, and any nonzero
//ZZ    value to indicate saturation. */
//ZZ static void or_into_QFLAG32 ( IRExpr* e, IRTemp condT )
//ZZ {
//ZZ    IRTemp old = get_QFLAG32();
//ZZ    IRTemp nyu = newTemp(Ity_I32);
//ZZ    assign(nyu, binop(Iop_Or32, mkexpr(old), e) );
//ZZ    put_QFLAG32(nyu, condT);
//ZZ }

/* ---------------- FPCR stuff ---------------- */

/* Generate IR to get hold of the rounding mode bits in FPCR, and
   convert them to IR format.  Bind the final result to the
   returned temp. */
static IRTemp /* :: Ity_I32 */ mk_get_IR_rounding_mode ( void )
{
   /* The ARMvfp encoding for rounding mode bits is:
         00  to nearest
         01  to +infinity
         10  to -infinity
         11  to zero
      We need to convert that to the IR encoding:
         00  to nearest (the default)
         10  to +infinity
         01  to -infinity
         11  to zero
      Which can be done by swapping bits 0 and 1.
      The rmode bits are at 23:22 in FPSCR.
   */
   IRTemp armEncd = newTemp(Ity_I32);
   IRTemp swapped = newTemp(Ity_I32);
   /* Fish FPCR[23:22] out, and slide to bottom.  Doesn't matter that
      we don't zero out bits 24 and above, since the assignment to
      'swapped' will mask them out anyway. */
   assign(armEncd,
          binop(Iop_Shr32, IRExpr_Get(OFFB_FPCR, Ity_I32), mkU8(22)));
   /* Now swap them. */
   assign(swapped,
          binop(Iop_Or32,
                binop(Iop_And32,
                      binop(Iop_Shl32, mkexpr(armEncd), mkU8(1)),
                      mkU32(2)),
                binop(Iop_And32,
                      binop(Iop_Shr32, mkexpr(armEncd), mkU8(1)),
                      mkU32(1))
         ));
   return swapped;
}

/*------------------------------------------------------------*/
/*--- Helpers for flag handling and conditional insns      ---*/
/*------------------------------------------------------------*/

static const HChar* nameARM64Condcode ( ARM64Condcode cond )
{
   switch (cond) {
      case ARM64CondEQ:  return "eq";
      case ARM64CondNE:  return "ne";
      case ARM64CondCS:  return "cs";  // or 'hs'
      case ARM64CondCC:  return "cc";  // or 'lo'
      case ARM64CondMI:  return "mi";
      case ARM64CondPL:  return "pl";
      case ARM64CondVS:  return "vs";
      case ARM64CondVC:  return "vc";
      case ARM64CondHI:  return "hi";
      case ARM64CondLS:  return "ls";
      case ARM64CondGE:  return "ge";
      case ARM64CondLT:  return "lt";
      case ARM64CondGT:  return "gt";
      case ARM64CondLE:  return "le";
      case ARM64CondAL:  return "al";
      case ARM64CondNV:  return "nv";
      default: vpanic("name_ARM64Condcode");
   }
}

/* and a handy shorthand for it */
static const HChar* nameCC ( ARM64Condcode cond ) {
   return nameARM64Condcode(cond);
}

/* Build IR to calculate some particular condition from stored
   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression of type
   Ity_I64, suitable for narrowing.  Although the return type is
   Ity_I64, the returned value is either 0 or 1.  'cond' must be
   :: Ity_I64 and must denote the condition to compute in 
   bits 7:4, and be zero everywhere else.
*/
static IRExpr* mk_arm64g_calculate_condition_dyn ( IRExpr* cond )
{
   vassert(typeOfIRExpr(irsb->tyenv, cond) == Ity_I64);
   /* And 'cond' had better produce a value in which only bits 7:4 are
      nonzero.  However, obviously we can't assert for that. */

/* So what we're constructing for the first argument is 
      "(cond << 4) | stored-operation".
      However, as per comments above, 'cond' must be supplied
      pre-shifted to this function.

This pairing scheme requires that the ARM64_CC_OP_ values all fit
      in 4 bits.  Hence we are passing a (COND, OP) pair in the lowest
      8 bits of the first argument. */
   IRExpr** args
      = mkIRExprVec_4(
           binop(Iop_Or64, IRExpr_Get(OFFB_CC_OP, Ity_I64), cond),
           IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
           IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
           IRExpr_Get(OFFB_CC_NDEP, Ity_I64)
        );
   IRExpr* call
      = mkIRExprCCall(
           Ity_I64,
           0/*regparm*/, 
           "arm64g_calculate_condition", &arm64g_calculate_condition,
           args
        );

/* Exclude the requested condition, OP and NDEP from definedness
      checking.  We're only interested in DEP1 and DEP2. */
   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
   return call;
}

/* Build IR to calculate some particular condition from stored
   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression of type
   Ity_I64, suitable for narrowing.  Although the return type is
   Ity_I64, the returned value is either 0 or 1.
*/
static IRExpr* mk_arm64g_calculate_condition ( ARM64Condcode cond )
{
  /* First arg is "(cond << 4) | condition".  This requires that the
     ARM64_CC_OP_ values all fit in 4 bits.  Hence we are passing a
     (COND, OP) pair in the lowest 8 bits of the first argument. */
   vassert(cond >= 0 && cond <= 15);
   return mk_arm64g_calculate_condition_dyn( mkU64(cond << 4) );
}

/* Build IR to calculate just the carry flag from stored
   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
   Ity_I64. */
static IRExpr* mk_arm64g_calculate_flag_c ( void )
{
   IRExpr** args
      = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I64),
                       IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
                       IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
                       IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
   IRExpr* call
      = mkIRExprCCall(
           Ity_I64,
           0/*regparm*/,
           "arm64g_calculate_flag_c", &arm64g_calculate_flag_c,
           args
        );
   /* Exclude OP and NDEP from definedness checking.  We're only
      interested in DEP1 and DEP2. */
   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
   return call;
}

//ZZ /* Build IR to calculate just the overflow flag from stored
//ZZ    CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
//ZZ    Ity_I32. */
//ZZ static IRExpr* mk_armg_calculate_flag_v ( void )
//ZZ {
//ZZ    IRExpr** args
//ZZ       = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I32),
//ZZ                        IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
//ZZ                        IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
//ZZ                        IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
//ZZ    IRExpr* call
//ZZ       = mkIRExprCCall(
//ZZ            Ity_I32,
//ZZ            0/*regparm*/, 
//ZZ            "armg_calculate_flag_v", &armg_calculate_flag_v,
//ZZ            args
//ZZ         );
//ZZ    /* Exclude OP and NDEP from definedness checking.  We're only
//ZZ       interested in DEP1 and DEP2. */
//ZZ    call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
//ZZ    return call;
//ZZ }

/* Build IR to calculate N Z C V in bits 31:28 of the
   returned word. */
static IRExpr* mk_arm64g_calculate_flags_nzcv ( void )
{
   IRExpr** args
      = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I64),
                       IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
                       IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
                       IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
   IRExpr* call
      = mkIRExprCCall(
           Ity_I64,
           0/*regparm*/, 
           "arm64g_calculate_flags_nzcv", &arm64g_calculate_flags_nzcv,
           args
        );
   /* Exclude OP and NDEP from definedness checking.  We're only
      interested in DEP1 and DEP2. */
   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
   return call;
}

/* Build IR to set the flags thunk, in the most general case. */
static
void setFlags_D1_D2_ND ( UInt cc_op,
                         IRTemp t_dep1, IRTemp t_dep2, IRTemp t_ndep )
{
   vassert(typeOfIRTemp(irsb->tyenv, t_dep1 == Ity_I64));
   vassert(typeOfIRTemp(irsb->tyenv, t_dep2 == Ity_I64));
   vassert(typeOfIRTemp(irsb->tyenv, t_ndep == Ity_I64));
   vassert(cc_op >= ARM64G_CC_OP_COPY && cc_op < ARM64G_CC_OP_NUMBER);
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(cc_op) ));
   stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(t_dep1) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkexpr(t_dep2) ));
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(t_ndep) ));
}

/* Build IR to set the flags thunk after ADD or SUB. */
static
void setFlags_ADD_SUB ( Bool is64, Bool isSUB, IRTemp argL, IRTemp argR )
{
   IRTemp argL64 = IRTemp_INVALID;
   IRTemp argR64 = IRTemp_INVALID;
   IRTemp z64    = newTemp(Ity_I64);
   if (is64) {
      argL64 = argL;
      argR64 = argR;
   } else {
      argL64 = newTemp(Ity_I64);
      argR64 = newTemp(Ity_I64);
      assign(argL64, unop(Iop_32Uto64, mkexpr(argL)));
      assign(argR64, unop(Iop_32Uto64, mkexpr(argR)));
   }
   assign(z64, mkU64(0));
   UInt cc_op = ARM64G_CC_OP_NUMBER;
   /**/ if ( isSUB &&  is64) { cc_op = ARM64G_CC_OP_SUB64; }
   else if ( isSUB && !is64) { cc_op = ARM64G_CC_OP_SUB32; }
   else if (!isSUB &&  is64) { cc_op = ARM64G_CC_OP_ADD64; }
   else if (!isSUB && !is64) { cc_op = ARM64G_CC_OP_ADD32; }
   else                      { vassert(0); }
   setFlags_D1_D2_ND(cc_op, argL64, argR64, z64);
}

/* Build IR to set the flags thunk after ADC or SBC. */
static
void setFlags_ADC_SBC ( Bool is64, Bool isSBC,
                        IRTemp argL, IRTemp argR, IRTemp oldC )
{
   IRTemp argL64 = IRTemp_INVALID;
   IRTemp argR64 = IRTemp_INVALID;
   IRTemp oldC64 = IRTemp_INVALID;
   if (is64) {
      argL64 = argL;
      argR64 = argR;
      oldC64 = oldC;
   } else {
      argL64 = newTemp(Ity_I64);
      argR64 = newTemp(Ity_I64);
      oldC64 = newTemp(Ity_I64);
      assign(argL64, unop(Iop_32Uto64, mkexpr(argL)));
      assign(argR64, unop(Iop_32Uto64, mkexpr(argR)));
      assign(oldC64, unop(Iop_32Uto64, mkexpr(oldC)));
   }
   UInt cc_op = ARM64G_CC_OP_NUMBER;
   /**/ if ( isSBC &&  is64) { cc_op = ARM64G_CC_OP_SBC64; }
   else if ( isSBC && !is64) { cc_op = ARM64G_CC_OP_SBC32; }
   else if (!isSBC &&  is64) { cc_op = ARM64G_CC_OP_ADC64; }
   else if (!isSBC && !is64) { cc_op = ARM64G_CC_OP_ADC32; }
   else                      { vassert(0); }
   setFlags_D1_D2_ND(cc_op, argL64, argR64, oldC64);
}

/* Build IR to set the flags thunk after ADD or SUB, if the given
   condition evaluates to True at run time.  If not, the flags are set
   to the specified NZCV value. */
static
void setFlags_ADD_SUB_conditionally (
        Bool is64, Bool isSUB,
        IRTemp cond, IRTemp argL, IRTemp argR, UInt nzcv
     )
{
   /* Generate IR as follows:
        CC_OP   = ITE(cond, OP_{ADD,SUB}{32,64}, OP_COPY)
        CC_DEP1 = ITE(cond, argL64, nzcv << 28)
        CC_DEP2 = ITE(cond, argR64, 0)
        CC_NDEP = 0
   */

IRTemp z64 = newTemp(Ity_I64);
   assign(z64, mkU64(0));

/* Establish the operation and operands for the True case. */
   IRTemp t_dep1 = IRTemp_INVALID;
   IRTemp t_dep2 = IRTemp_INVALID;
   UInt   t_op   = ARM64G_CC_OP_NUMBER;
   /**/ if ( isSUB &&  is64) { t_op = ARM64G_CC_OP_SUB64; }
   else if ( isSUB && !is64) { t_op = ARM64G_CC_OP_SUB32; }
   else if (!isSUB &&  is64) { t_op = ARM64G_CC_OP_ADD64; }
   else if (!isSUB && !is64) { t_op = ARM64G_CC_OP_ADD32; }
   else                      { vassert(0); }
   /* */
   if (is64) {
      t_dep1 = argL;
      t_dep2 = argR;
   } else {
      t_dep1 = newTemp(Ity_I64);
      t_dep2 = newTemp(Ity_I64);
      assign(t_dep1, unop(Iop_32Uto64, mkexpr(argL)));
      assign(t_dep2, unop(Iop_32Uto64, mkexpr(argR)));
   }

/* Establish the operation and operands for the False case. */
   IRTemp f_dep1 = newTemp(Ity_I64);
   IRTemp f_dep2 = z64;
   UInt   f_op   = ARM64G_CC_OP_COPY;
   assign(f_dep1, mkU64(nzcv << 28));

/* Final thunk values */
   IRTemp dep1 = newTemp(Ity_I64);
   IRTemp dep2 = newTemp(Ity_I64);
   IRTemp op   = newTemp(Ity_I64);

assign(op,   IRExpr_ITE(mkexpr(cond), mkU64(t_op), mkU64(f_op)));
   assign(dep1, IRExpr_ITE(mkexpr(cond), mkexpr(t_dep1), mkexpr(f_dep1)));
   assign(dep2, IRExpr_ITE(mkexpr(cond), mkexpr(t_dep2), mkexpr(f_dep2)));

/* finally .. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkexpr(op) ));
   stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(dep1) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkexpr(dep2) ));
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(z64) ));
}

/* Build IR to set the flags thunk after AND/OR/XOR or variants thereof. */
static
void setFlags_LOGIC ( Bool is64, IRTemp res )
{
   IRTemp res64 = IRTemp_INVALID;
   IRTemp z64   = newTemp(Ity_I64);
   UInt   cc_op = ARM64G_CC_OP_NUMBER;
   if (is64) {
      res64 = res;
      cc_op = ARM64G_CC_OP_LOGIC64;
   } else {
      res64 = newTemp(Ity_I64);
      assign(res64, unop(Iop_32Uto64, mkexpr(res)));
      cc_op = ARM64G_CC_OP_LOGIC32;
   }
   assign(z64, mkU64(0));
   setFlags_D1_D2_ND(cc_op, res64, z64, z64);
}

/* Build IR to set the flags thunk to a given NZCV value.  NZCV is
   located in bits 31:28 of the supplied value. */
static
void setFlags_COPY ( IRTemp nzcv_28x0 )
{
   IRTemp z64 = newTemp(Ity_I64);
   assign(z64, mkU64(0));
   setFlags_D1_D2_ND(ARM64G_CC_OP_COPY, nzcv_28x0, z64, z64);
}

//ZZ /* Minor variant of the above that sets NDEP to zero (if it
//ZZ    sets it at all) */
//ZZ static void setFlags_D1_D2 ( UInt cc_op, IRTemp t_dep1,
//ZZ                              IRTemp t_dep2,
//ZZ                              IRTemp guardT /* :: Ity_I32, 0 or 1 */ )
//ZZ {
//ZZ    IRTemp z32 = newTemp(Ity_I32);
//ZZ    assign( z32, mkU32(0) );
//ZZ    setFlags_D1_D2_ND( cc_op, t_dep1, t_dep2, z32, guardT );
//ZZ }
//ZZ 
//ZZ 
//ZZ /* Minor variant of the above that sets DEP2 to zero (if it
//ZZ    sets it at all) */
//ZZ static void setFlags_D1_ND ( UInt cc_op, IRTemp t_dep1,
//ZZ                              IRTemp t_ndep,
//ZZ                              IRTemp guardT /* :: Ity_I32, 0 or 1 */ )
//ZZ {
//ZZ    IRTemp z32 = newTemp(Ity_I32);
//ZZ    assign( z32, mkU32(0) );
//ZZ    setFlags_D1_D2_ND( cc_op, t_dep1, z32, t_ndep, guardT );
//ZZ }
//ZZ 
//ZZ 
//ZZ /* Minor variant of the above that sets DEP2 and NDEP to zero (if it
//ZZ    sets them at all) */
//ZZ static void setFlags_D1 ( UInt cc_op, IRTemp t_dep1,
//ZZ                           IRTemp guardT /* :: Ity_I32, 0 or 1 */ )
//ZZ {
//ZZ    IRTemp z32 = newTemp(Ity_I32);
//ZZ    assign( z32, mkU32(0) );
//ZZ    setFlags_D1_D2_ND( cc_op, t_dep1, z32, z32, guardT );
//ZZ }

/*------------------------------------------------------------*/
/*--- Misc math helpers                                    ---*/
/*------------------------------------------------------------*/

/* Generate IR for ((x & mask) >>u sh) | ((x << sh) & mask) */
static IRTemp math_SWAPHELPER ( IRTemp x, ULong mask, Int sh )
{
   IRTemp maskT = newTemp(Ity_I64);
   IRTemp res   = newTemp(Ity_I64);
   vassert(sh >= 1 && sh <= 63);
   assign(maskT, mkU64(mask));
   assign( res,
           binop(Iop_Or64,
                 binop(Iop_Shr64,
                       binop(Iop_And64,mkexpr(x),mkexpr(maskT)),
                       mkU8(sh)),
                 binop(Iop_And64,
                       binop(Iop_Shl64,mkexpr(x),mkU8(sh)),
                       mkexpr(maskT))
                 ) 
           );
   return res;
}

/* Generates byte swaps within 32-bit lanes. */
static IRTemp math_UINTSWAP64 ( IRTemp src )
{
   IRTemp res;
   res = math_SWAPHELPER(src, 0xFF00FF00FF00FF00ULL, 8);
   res = math_SWAPHELPER(res, 0xFFFF0000FFFF0000ULL, 16);
   return res;
}

/* Generates byte swaps within 16-bit lanes. */
static IRTemp math_USHORTSWAP64 ( IRTemp src )
{
   IRTemp res;
   res = math_SWAPHELPER(src, 0xFF00FF00FF00FF00ULL, 8);
   return res;
}

/* Generates a 64-bit byte swap. */
static IRTemp math_BYTESWAP64 ( IRTemp src )
{
   IRTemp res;
   res = math_SWAPHELPER(src, 0xFF00FF00FF00FF00ULL, 8);
   res = math_SWAPHELPER(res, 0xFFFF0000FFFF0000ULL, 16);
   res = math_SWAPHELPER(res, 0xFFFFFFFF00000000ULL, 32);
   return res;
}

/* Generates a 64-bit bit swap. */
static IRTemp math_BITSWAP64 ( IRTemp src )
{
   IRTemp res;
   res = math_SWAPHELPER(src, 0xAAAAAAAAAAAAAAAAULL, 1);
   res = math_SWAPHELPER(res, 0xCCCCCCCCCCCCCCCCULL, 2);
   res = math_SWAPHELPER(res, 0xF0F0F0F0F0F0F0F0ULL, 4);
   return math_BYTESWAP64(res);
}

/* Duplicates the bits at the bottom of the given word to fill the
   whole word.  src :: Ity_I64 is assumed to have zeroes everywhere
   except for the bottom bits. */
static IRTemp math_DUP_TO_64 ( IRTemp src, IRType srcTy )
{
   if (srcTy == Ity_I8) {
      IRTemp t16 = newTemp(Ity_I64);
      assign(t16, binop(Iop_Or64, mkexpr(src),
                                  binop(Iop_Shl64, mkexpr(src), mkU8(8))));
      IRTemp t32 = newTemp(Ity_I64);
      assign(t32, binop(Iop_Or64, mkexpr(t16),
                                  binop(Iop_Shl64, mkexpr(t16), mkU8(16))));
      IRTemp t64 = newTemp(Ity_I64);
      assign(t64, binop(Iop_Or64, mkexpr(t32),
                                  binop(Iop_Shl64, mkexpr(t32), mkU8(32))));
      return t64;
   }
   if (srcTy == Ity_I16) {
      IRTemp t32 = newTemp(Ity_I64);
      assign(t32, binop(Iop_Or64, mkexpr(src),
                                  binop(Iop_Shl64, mkexpr(src), mkU8(16))));
      IRTemp t64 = newTemp(Ity_I64);
      assign(t64, binop(Iop_Or64, mkexpr(t32),
                                  binop(Iop_Shl64, mkexpr(t32), mkU8(32))));
      return t64;
   }
   if (srcTy == Ity_I32) {
      IRTemp t64 = newTemp(Ity_I64);
      assign(t64, binop(Iop_Or64, mkexpr(src),
                                  binop(Iop_Shl64, mkexpr(src), mkU8(32))));
      return t64;
   }
   if (srcTy == Ity_I64) {
      return src;
   }
   vassert(0);
}

/* Duplicates the src element exactly so as to fill a V128 value. */
static IRTemp math_DUP_TO_V128 ( IRTemp src, IRType srcTy )
{
   IRTemp res = newTempV128();
   if (srcTy == Ity_F64) {
      IRTemp i64 = newTemp(Ity_I64);
      assign(i64, unop(Iop_ReinterpF64asI64, mkexpr(src)));
      assign(res, binop(Iop_64HLtoV128, mkexpr(i64), mkexpr(i64)));
      return res;
   }
   if (srcTy == Ity_F32) {
      IRTemp i64a = newTemp(Ity_I64);
      assign(i64a, unop(Iop_32Uto64, unop(Iop_ReinterpF32asI32, mkexpr(src))));
      IRTemp i64b = newTemp(Ity_I64);
      assign(i64b, binop(Iop_Or64, binop(Iop_Shl64, mkexpr(i64a), mkU8(32)),
                                   mkexpr(i64a)));
      assign(res, binop(Iop_64HLtoV128, mkexpr(i64b), mkexpr(i64b)));
      return res;
   }
   if (srcTy == Ity_I64) {
      assign(res, binop(Iop_64HLtoV128, mkexpr(src), mkexpr(src)));
      return res;
   }
   if (srcTy == Ity_I32 || srcTy == Ity_I16 || srcTy == Ity_I8) {
      IRTemp t1 = newTemp(Ity_I64);
      assign(t1, widenUto64(srcTy, mkexpr(src)));
      IRTemp t2 = math_DUP_TO_64(t1, srcTy);
      assign(res, binop(Iop_64HLtoV128, mkexpr(t2), mkexpr(t2)));
      return res;
   }
   vassert(0);
}

/* |fullWidth| is a full V128 width result.  Depending on bitQ,
   zero out the upper half. */
static IRExpr* math_MAYBE_ZERO_HI64 ( UInt bitQ, IRTemp fullWidth )
{
   if (bitQ == 1) return mkexpr(fullWidth);
   if (bitQ == 0) return unop(Iop_ZeroHI64ofV128, mkexpr(fullWidth));
   vassert(0);
}

/* The same, but from an expression instead. */
static IRExpr* math_MAYBE_ZERO_HI64_fromE ( UInt bitQ, IRExpr* fullWidth )
{
   IRTemp fullWidthT = newTempV128();
   assign(fullWidthT, fullWidth);
   return math_MAYBE_ZERO_HI64(bitQ, fullWidthT);
}

/*------------------------------------------------------------*/
/*--- FP comparison helpers                                ---*/
/*------------------------------------------------------------*/

/* irRes :: Ity_I32 holds a floating point comparison result encoded
   as an IRCmpF64Result.  Generate code to convert it to an
   ARM64-encoded (N,Z,C,V) group in the lowest 4 bits of an I64 value.
   Assign a new temp to hold that value, and return the temp. */
static
IRTemp mk_convert_IRCmpF64Result_to_NZCV ( IRTemp irRes32 )
{
   IRTemp ix       = newTemp(Ity_I64);
   IRTemp termL    = newTemp(Ity_I64);
   IRTemp termR    = newTemp(Ity_I64);
   IRTemp nzcv     = newTemp(Ity_I64);
   IRTemp irRes    = newTemp(Ity_I64);

/* This is where the fun starts.  We have to convert 'irRes' from
      an IR-convention return result (IRCmpF64Result) to an
      ARM-encoded (N,Z,C,V) group.  The final result is in the bottom
      4 bits of 'nzcv'. */
   /* Map compare result from IR to ARM(nzcv) */
   /*
      FP cmp result | IR   | ARM(nzcv)
      --------------------------------
      UN              0x45   0011
      LT              0x01   1000
      GT              0x00   0010
      EQ              0x40   0110
   */
   /* Now since you're probably wondering WTF ..

ix fishes the useful bits out of the IR value, bits 6 and 0, and
      places them side by side, giving a number which is 0, 1, 2 or 3.

termL is a sequence cooked up by GNU superopt.  It converts ix
         into an almost correct value NZCV value (incredibly), except
         for the case of UN, where it produces 0100 instead of the
         required 0011.

termR is therefore a correction term, also computed from ix.  It
         is 1 in the UN case and 0 for LT, GT and UN.  Hence, to get
         the final correct value, we subtract termR from termL.

Don't take my word for it.  There's a test program at the bottom
      of guest_arm_toIR.c, to try this out with.
   */
   assign(irRes, unop(Iop_32Uto64, mkexpr(irRes32)));

assign(
      ix,
      binop(Iop_Or64,
            binop(Iop_And64,
                  binop(Iop_Shr64, mkexpr(irRes), mkU8(5)),
                  mkU64(3)),
            binop(Iop_And64, mkexpr(irRes), mkU64(1))));

assign(
      termL,
      binop(Iop_Add64,
            binop(Iop_Shr64,
                  binop(Iop_Sub64,
                        binop(Iop_Shl64,
                              binop(Iop_Xor64, mkexpr(ix), mkU64(1)),
                              mkU8(62)),
                        mkU64(1)),
                  mkU8(61)),
            mkU64(1)));

assign(
      termR,
      binop(Iop_And64,
            binop(Iop_And64,
                  mkexpr(ix),
                  binop(Iop_Shr64, mkexpr(ix), mkU8(1))),
            mkU64(1)));

assign(nzcv, binop(Iop_Sub64, mkexpr(termL), mkexpr(termR)));
   return nzcv;
}

/*------------------------------------------------------------*/
/*--- Data processing (immediate)                          ---*/
/*------------------------------------------------------------*/

/* Helper functions for supporting "DecodeBitMasks" */

static ULong dbm_ROR ( Int width, ULong x, Int rot )
{
   vassert(width > 0 && width <= 64);
   vassert(rot >= 0 && rot < width);
   if (rot == 0) return x;
   ULong res = x >> rot;
   res |= (x << (width - rot));
   if (width < 64)
     res &= ((1ULL << width) - 1);
   return res;
}

static ULong dbm_RepTo64( Int esize, ULong x )
{
   switch (esize) {
      case 64:
         return x;
      case 32:
         x &= 0xFFFFFFFF; x |= (x << 32);
         return x;
      case 16:
         x &= 0xFFFF; x |= (x << 16); x |= (x << 32);
         return x;
      case 8:
         x &= 0xFF; x |= (x << 8); x |= (x << 16); x |= (x << 32);
         return x;
      case 4:
         x &= 0xF; x |= (x << 4); x |= (x << 8);
         x |= (x << 16); x |= (x << 32);
         return x;
      case 2:
         x &= 0x3; x |= (x << 2); x |= (x << 4); x |= (x << 8);
         x |= (x << 16); x |= (x << 32);
         return x;
      default:
         break;
   }
   vpanic("dbm_RepTo64");
   /*NOTREACHED*/
   return 0;
}

static Int dbm_highestSetBit ( ULong x )
{
   Int i;
   for (i = 63; i >= 0; i--) {
      if (x & (1ULL << i))
         return i;
   }
   vassert(x == 0);
   return -1;
}

static
Bool dbm_DecodeBitMasks ( /*OUT*/ULong* wmask, /*OUT*/ULong* tmask, 
                          ULong immN, ULong imms, ULong immr, Bool immediate,
                          UInt M /*32 or 64*/)
{
   vassert(immN < (1ULL << 1));
   vassert(imms < (1ULL << 6));
   vassert(immr < (1ULL << 6));
   vassert(immediate == False || immediate == True);
   vassert(M == 32 || M == 64);

Int len = dbm_highestSetBit( ((immN << 6) & 64) | ((~imms) & 63) );
   if (len < 1) { /* printf("fail1\n"); */ return False; }
   vassert(len <= 6);
   vassert(M >= (1 << len));

vassert(len >= 1 && len <= 6);
   ULong levels = // (zeroes(6 - len) << (6-len)) | ones(len);
                  (1 << len) - 1;
   vassert(levels >= 1 && levels <= 63);

if (immediate && ((imms & levels) == levels)) { 
      /* printf("fail2 imms %llu levels %llu len %d\n", imms, levels, len); */
      return False;
   }

ULong S = imms & levels;
   ULong R = immr & levels;
   Int   diff = S - R;
   diff &= 63;
   Int esize = 1 << len;
   vassert(2 <= esize && esize <= 64);

/* Be careful of these (1ULL << (S+1)) - 1 expressions, and the
      same below with d.  S can be 63 in which case we have an out of
      range and hence undefined shift. */
   vassert(S >= 0 && S <= 63);
   vassert(esize >= (S+1));
   ULong elem_s = // Zeroes(esize-(S+1)):Ones(S+1)
                  //(1ULL << (S+1)) - 1;
                  ((1ULL << S) - 1) + (1ULL << S);

Int d = // diff<len-1:0>
           diff & ((1 << len)-1);
   vassert(esize >= (d+1));
   vassert(d >= 0 && d <= 63);

ULong elem_d = // Zeroes(esize-(d+1)):Ones(d+1)
                  //(1ULL << (d+1)) - 1;
                  ((1ULL << d) - 1) + (1ULL << d);

if (esize != 64) vassert(elem_s < (1ULL << esize));
   if (esize != 64) vassert(elem_d < (1ULL << esize));

if (wmask) *wmask = dbm_RepTo64(esize, dbm_ROR(esize, elem_s, R));
   if (tmask) *tmask = dbm_RepTo64(esize, elem_d);

return True;
}

static
Bool dis_ARM64_data_processing_immediate(/*MB_OUT*/DisResult* dres,
                                         UInt insn)
{
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))

/* insn[28:23]
      10000x PC-rel addressing
      10001x Add/subtract (immediate)
      100100 Logical (immediate)
      100101 Move Wide (immediate)
      100110 Bitfield
      100111 Extract
   */

/* ------------------ ADD/SUB{,S} imm12 ------------------ */
   if (INSN(28,24) == BITS5(1,0,0,0,1)) {
      Bool is64   = INSN(31,31) == 1;
      Bool isSub  = INSN(30,30) == 1;
      Bool setCC  = INSN(29,29) == 1;
      UInt sh     = INSN(23,22);
      UInt uimm12 = INSN(21,10);
      UInt nn     = INSN(9,5);
      UInt dd     = INSN(4,0);
      const HChar* nm = isSub ? "sub" : "add";
      if (sh >= 2) {
         /* Invalid; fall through */
      } else {
         vassert(sh <= 1);
         uimm12 <<= (12 * sh);
         if (is64) {
            IRTemp argL  = newTemp(Ity_I64);
            IRTemp argR  = newTemp(Ity_I64);
            IRTemp res   = newTemp(Ity_I64);
            assign(argL, getIReg64orSP(nn));
            assign(argR, mkU64(uimm12));
            assign(res,  binop(isSub ? Iop_Sub64 : Iop_Add64,
                               mkexpr(argL), mkexpr(argR)));
            if (setCC) {
               putIReg64orZR(dd, mkexpr(res));
               setFlags_ADD_SUB(True/*is64*/, isSub, argL, argR);
               DIP("%ss %s, %s, 0x%x\n",
                   nm, nameIReg64orZR(dd), nameIReg64orSP(nn), uimm12);
            } else {
               putIReg64orSP(dd, mkexpr(res));
               DIP("%s %s, %s, 0x%x\n",
                   nm, nameIReg64orSP(dd), nameIReg64orSP(nn), uimm12);
            }
         } else {
            IRTemp argL  = newTemp(Ity_I32);
            IRTemp argR  = newTemp(Ity_I32);
            IRTemp res   = newTemp(Ity_I32);
            assign(argL, getIReg32orSP(nn));
            assign(argR, mkU32(uimm12));
            assign(res,  binop(isSub ? Iop_Sub32 : Iop_Add32,
                               mkexpr(argL), mkexpr(argR)));
            if (setCC) {
               putIReg32orZR(dd, mkexpr(res));
               setFlags_ADD_SUB(False/*!is64*/, isSub, argL, argR);
               DIP("%ss %s, %s, 0x%x\n",
                   nm, nameIReg32orZR(dd), nameIReg32orSP(nn), uimm12);
            } else {
               putIReg32orSP(dd, mkexpr(res));
               DIP("%s %s, %s, 0x%x\n",
                   nm, nameIReg32orSP(dd), nameIReg32orSP(nn), uimm12);
            }
         }
         return True;
      }
   }

/* -------------------- ADR/ADRP -------------------- */
   if (INSN(28,24) == BITS5(1,0,0,0,0)) {
      UInt  bP    = INSN(31,31);
      UInt  immLo = INSN(30,29);
      UInt  immHi = INSN(23,5);
      UInt  rD    = INSN(4,0);
      ULong uimm  = (immHi << 2) | immLo;
      ULong simm  = sx_to_64(uimm, 21);
      ULong val;
      if (bP) {
         val = (guest_PC_curr_instr & 0xFFFFFFFFFFFFF000ULL) + (simm << 12);
      } else {
         val = guest_PC_curr_instr + simm;
      }
      putIReg64orZR(rD, mkU64(val));
      DIP("adr%s %s, 0x%llx\n", bP ? "p" : "", nameIReg64orZR(rD), val);
      return True;
   }

/* -------------------- LOGIC(imm) -------------------- */
   if (INSN(28,23) == BITS6(1,0,0,1,0,0)) {
      /* 31 30 28     22 21   15   9  4
         sf op 100100 N  immr imms Rn Rd
           op=00: AND  Rd|SP, Rn, #imm
           op=01: ORR  Rd|SP, Rn, #imm
           op=10: EOR  Rd|SP, Rn, #imm
           op=11: ANDS Rd|ZR, Rn, #imm
      */
      Bool  is64 = INSN(31,31) == 1;
      UInt  op   = INSN(30,29);
      UInt  N    = INSN(22,22);
      UInt  immR = INSN(21,16);
      UInt  immS = INSN(15,10);
      UInt  nn   = INSN(9,5);
      UInt  dd   = INSN(4,0);
      ULong imm  = 0;
      Bool  ok;
      if (N == 1 && !is64) 
         goto after_logic_imm; /* not allowed; fall through */
      ok = dbm_DecodeBitMasks(&imm, NULL,
                              N, immS, immR, True, is64 ? 64 : 32);
      if (!ok)
         goto after_logic_imm;

const HChar* names[4] = { "and", "orr", "eor", "ands" };
      const IROp   ops64[4] = { Iop_And64, Iop_Or64, Iop_Xor64, Iop_And64 };
      const IROp   ops32[4] = { Iop_And32, Iop_Or32, Iop_Xor32, Iop_And32 };

vassert(op < 4);
      if (is64) {
         IRExpr* argL = getIReg64orZR(nn);
         IRExpr* argR = mkU64(imm);
         IRTemp  res  = newTemp(Ity_I64);
         assign(res, binop(ops64[op], argL, argR));
         if (op < 3) {
            putIReg64orSP(dd, mkexpr(res));
            DIP("%s %s, %s, 0x%llx\n", names[op],
                nameIReg64orSP(dd), nameIReg64orZR(nn), imm);
         } else {
            putIReg64orZR(dd, mkexpr(res));
            setFlags_LOGIC(True/*is64*/, res);
            DIP("%s %s, %s, 0x%llx\n", names[op],
                nameIReg64orZR(dd), nameIReg64orZR(nn), imm);
         }
      } else {
         IRExpr* argL = getIReg32orZR(nn);
         IRExpr* argR = mkU32((UInt)imm);
         IRTemp  res  = newTemp(Ity_I32);
         assign(res, binop(ops32[op], argL, argR));
         if (op < 3) {
            putIReg32orSP(dd, mkexpr(res));
            DIP("%s %s, %s, 0x%x\n", names[op],
                nameIReg32orSP(dd), nameIReg32orZR(nn), (UInt)imm);
         } else {
            putIReg32orZR(dd, mkexpr(res));
            setFlags_LOGIC(False/*!is64*/, res);
            DIP("%s %s, %s, 0x%x\n", names[op],
                nameIReg32orZR(dd), nameIReg32orZR(nn), (UInt)imm);
         }
      }
      return True;
   }
   after_logic_imm:

/* -------------------- MOV{Z,N,K} -------------------- */
   if (INSN(28,23) == BITS6(1,0,0,1,0,1)) {
      /* 31 30 28      22 20    4
         |  |  |       |  |     |
         sf 10 100 101 hw imm16 Rd   MOV(Z) Rd, (imm16 << (16*hw))
         sf 00 100 101 hw imm16 Rd   MOV(N) Rd, ~(imm16 << (16*hw))
         sf 11 100 101 hw imm16 Rd   MOV(K) Rd, (imm16 << (16*hw))
      */
      Bool is64   = INSN(31,31) == 1;
      UInt subopc = INSN(30,29);
      UInt hw     = INSN(22,21);
      UInt imm16  = INSN(20,5);
      UInt dd     = INSN(4,0);
      if (subopc == BITS2(0,1) || (!is64 && hw >= 2)) {
         /* invalid; fall through */
      } else {
         ULong imm64 = ((ULong)imm16) << (16 * hw);
         if (!is64)
            vassert(imm64 < 0x100000000ULL);
         switch (subopc) {
            case BITS2(1,0): // MOVZ
               putIRegOrZR(is64, dd, is64 ? mkU64(imm64) : mkU32((UInt)imm64));
               DIP("movz %s, 0x%llx\n", nameIRegOrZR(is64, dd), imm64);
               break;
            case BITS2(0,0): // MOVN
               imm64 = ~imm64;
               if (!is64)
                  imm64 &= 0xFFFFFFFFULL;
               putIRegOrZR(is64, dd, is64 ? mkU64(imm64) : mkU32((UInt)imm64));
               DIP("movn %s, 0x%llx\n", nameIRegOrZR(is64, dd), imm64);
               break;
            case BITS2(1,1): // MOVK
               /* This is more complex.  We are inserting a slice into
                  the destination register, so we need to have the old
                  value of it. */
               if (is64) {
                  IRTemp old = newTemp(Ity_I64);
                  assign(old, getIReg64orZR(dd));
                  ULong mask = 0xFFFFULL << (16 * hw);
                  IRExpr* res
                     = binop(Iop_Or64, 
                             binop(Iop_And64, mkexpr(old), mkU64(~mask)),
                             mkU64(imm64));
                  putIReg64orZR(dd, res);
                  DIP("movk %s, 0x%x, lsl %u\n",
                      nameIReg64orZR(dd), imm16, 16*hw);
               } else {
                  IRTemp old = newTemp(Ity_I32);
                  assign(old, getIReg32orZR(dd));
                  vassert(hw <= 1);
                  UInt mask = 0xFFFF << (16 * hw);
                  IRExpr* res
                     = binop(Iop_Or32, 
                             binop(Iop_And32, mkexpr(old), mkU32(~mask)),
                             mkU32((UInt)imm64));
                  putIReg32orZR(dd, res);
                  DIP("movk %s, 0x%x, lsl %u\n",
                      nameIReg32orZR(dd), imm16, 16*hw);
               }
               break;
            default:
               vassert(0);
         }
         return True;
      }
   }

/* -------------------- {U,S,}BFM -------------------- */
   /*    30 28     22 21   15   9  4

sf 10 100110 N  immr imms nn dd
         UBFM Wd, Wn, #immr, #imms   when sf=0, N=0, immr[5]=0, imms[5]=0
         UBFM Xd, Xn, #immr, #imms   when sf=1, N=1

sf 00 100110 N  immr imms nn dd
         SBFM Wd, Wn, #immr, #imms   when sf=0, N=0, immr[5]=0, imms[5]=0
         SBFM Xd, Xn, #immr, #imms   when sf=1, N=1

sf 01 100110 N  immr imms nn dd
         BFM Wd, Wn, #immr, #imms   when sf=0, N=0, immr[5]=0, imms[5]=0
         BFM Xd, Xn, #immr, #imms   when sf=1, N=1
   */
   if (INSN(28,23) == BITS6(1,0,0,1,1,0)) {
      UInt sf     = INSN(31,31);
      UInt opc    = INSN(30,29);
      UInt N      = INSN(22,22);
      UInt immR   = INSN(21,16);
      UInt immS   = INSN(15,10);
      UInt nn     = INSN(9,5);
      UInt dd     = INSN(4,0);
      Bool inZero = False;
      Bool extend = False;
      const HChar* nm = "???";
      /* skip invalid combinations */
      switch (opc) {
         case BITS2(0,0):
            inZero = True; extend = True; nm = "sbfm"; break;
         case BITS2(0,1):
            inZero = False; extend = False; nm = "bfm"; break;
         case BITS2(1,0):
            inZero = True; extend = False; nm = "ubfm"; break;
         case BITS2(1,1):
            goto after_bfm; /* invalid */
         default:
            vassert(0);
      }
      if (sf == 1 && N != 1) goto after_bfm;
      if (sf == 0 && (N != 0 || ((immR >> 5) & 1) != 0
                             || ((immS >> 5) & 1) != 0)) goto after_bfm;
      ULong wmask = 0, tmask = 0;
      Bool ok = dbm_DecodeBitMasks(&wmask, &tmask,
                                   N, immS, immR, False, sf == 1 ? 64 : 32);
      if (!ok) goto after_bfm; /* hmmm */

Bool   is64 = sf == 1;
      IRType ty   = is64 ? Ity_I64 : Ity_I32;

IRTemp dst = newTemp(ty);
      IRTemp src = newTemp(ty);
      IRTemp bot = newTemp(ty);
      IRTemp top = newTemp(ty);
      IRTemp res = newTemp(ty);
      assign(dst, inZero ? mkU(ty,0) : getIRegOrZR(is64, dd));
      assign(src, getIRegOrZR(is64, nn));
      /* perform bitfield move on low bits */
      assign(bot, binop(mkOR(ty),
                        binop(mkAND(ty), mkexpr(dst), mkU(ty, ~wmask)),
                        binop(mkAND(ty), mkexpr(mathROR(ty, src, immR)),
                                         mkU(ty, wmask))));
      /* determine extension bits (sign, zero or dest register) */
      assign(top, mkexpr(extend ? mathREPLICATE(ty, src, immS) : dst));
      /* combine extension bits and result bits */
      assign(res, binop(mkOR(ty),
                        binop(mkAND(ty), mkexpr(top), mkU(ty, ~tmask)),
                        binop(mkAND(ty), mkexpr(bot), mkU(ty, tmask))));
      putIRegOrZR(is64, dd, mkexpr(res));
      DIP("%s %s, %s, immR=%u, immS=%u\n",
          nm, nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn), immR, immS);
      return True;
   }
   after_bfm:

/* ---------------------- EXTR ---------------------- */
   /*   30 28     22 20 15   9 4
      1 00 100111 10 m  imm6 n d  EXTR Xd, Xn, Xm, #imm6
      0 00 100111 00 m  imm6 n d  EXTR Wd, Wn, Wm, #imm6 when #imm6 < 32
   */
   if (INSN(30,23) == BITS8(0,0,1,0,0,1,1,1) && INSN(21,21) == 0) {
      Bool is64  = INSN(31,31) == 1;
      UInt mm    = INSN(20,16);
      UInt imm6  = INSN(15,10);
      UInt nn    = INSN(9,5);
      UInt dd    = INSN(4,0);
      Bool valid = True;
      if (INSN(31,31) != INSN(22,22))
        valid = False;
      if (!is64 && imm6 >= 32)
        valid = False;
      if (!valid) goto after_extr;
      IRType ty    = is64 ? Ity_I64 : Ity_I32;
      IRTemp srcHi = newTemp(ty);
      IRTemp srcLo = newTemp(ty);
      IRTemp res   = newTemp(ty);
      assign(srcHi, getIRegOrZR(is64, nn));
      assign(srcLo, getIRegOrZR(is64, mm));
      if (imm6 == 0) {
        assign(res, mkexpr(srcLo));
      } else {
        UInt szBits = 8 * sizeofIRType(ty);
        vassert(imm6 > 0 && imm6 < szBits);
        assign(res, binop(mkOR(ty),
                          binop(mkSHL(ty), mkexpr(srcHi), mkU8(szBits-imm6)),
                          binop(mkSHR(ty), mkexpr(srcLo), mkU8(imm6))));
      }
      putIRegOrZR(is64, dd, mkexpr(res));
      DIP("extr %s, %s, %s, #%u\n",
          nameIRegOrZR(is64,dd),
          nameIRegOrZR(is64,nn), nameIRegOrZR(is64,mm), imm6);
      return True;
   }
  after_extr:

vex_printf("ARM64 front end: data_processing_immediate\n");
   return False;
#  undef INSN
}

/*------------------------------------------------------------*/
/*--- Data processing (register) instructions              ---*/
/*------------------------------------------------------------*/

static const HChar* nameSH ( UInt sh ) {
   switch (sh) {
      case 0: return "lsl";
      case 1: return "lsr";
      case 2: return "asr";
      case 3: return "ror";
      default: vassert(0);
   }
}

/* Generate IR to get a register value, possibly shifted by an
   immediate.  Returns either a 32- or 64-bit temporary holding the
   result.  After the shift, the value can optionally be NOT-ed 
   too.

sh_how coding: 00=SHL, 01=SHR, 10=SAR, 11=ROR.  sh_amt may only be
   in the range 0 to (is64 ? 64 : 32)-1.  For some instructions, ROR
   isn't allowed, but it's the job of the caller to check that.
*/
static IRTemp getShiftedIRegOrZR ( Bool is64,
                                   UInt sh_how, UInt sh_amt, UInt regNo,
                                   Bool invert )
{
   vassert(sh_how < 4);
   vassert(sh_amt < (is64 ? 64 : 32));
   IRType ty = is64 ? Ity_I64 : Ity_I32;
   IRTemp t0 = newTemp(ty);
   assign(t0, getIRegOrZR(is64, regNo));
   IRTemp t1 = newTemp(ty);
   switch (sh_how) {
      case BITS2(0,0):
         assign(t1, binop(mkSHL(ty), mkexpr(t0), mkU8(sh_amt)));
         break;
      case BITS2(0,1):
         assign(t1, binop(mkSHR(ty), mkexpr(t0), mkU8(sh_amt)));
         break;
      case BITS2(1,0):
         assign(t1, binop(mkSAR(ty), mkexpr(t0), mkU8(sh_amt)));
         break;
      case BITS2(1,1):
         assign(t1, mkexpr(mathROR(ty, t0, sh_amt)));
         break;
      default:
         vassert(0);
   }
   if (invert) {
      IRTemp t2 = newTemp(ty);
      assign(t2, unop(mkNOT(ty), mkexpr(t1)));
      return t2;
   } else {
      return t1;
   }
}

static
Bool dis_ARM64_data_processing_register(/*MB_OUT*/DisResult* dres,
                                        UInt insn)
{
#  define INSN(_bMax,_bMin)  SLICE_UInt(insn, (_bMax), (_bMin))

/* ------------------- ADD/SUB(reg) ------------------- */
   /* x==0 => 32 bit op      x==1 => 64 bit op
      sh: 00=LSL, 01=LSR, 10=ASR, 11=ROR(NOT ALLOWED)

31 30 29 28    23 21 20 15   9  4
      |  |  |  |     |  |  |  |    |  |
      x  0  0  01011 sh 0  Rm imm6 Rn Rd   ADD  Rd,Rn, sh(Rm,imm6)
      x  0  1  01011 sh 0  Rm imm6 Rn Rd   ADDS Rd,Rn, sh(Rm,imm6)
      x  1  0  01011 sh 0  Rm imm6 Rn Rd   SUB  Rd,Rn, sh(Rm,imm6)
      x  1  1  01011 sh 0  Rm imm6 Rn Rd   SUBS Rd,Rn, sh(Rm,imm6)
   */
   if (INSN(28,24) == BITS5(0,1,0,1,1) && INSN(21,21) == 0) {
      UInt   bX    = INSN(31,31);
      UInt   bOP   = INSN(30,30); /* 0: ADD, 1: SUB */
      UInt   bS    = INSN(29, 29); /* set flags? */
      UInt   sh    = INSN(23,22);
      UInt   rM    = INSN(20,16);
      UInt   imm6  = INSN(15,10);
      UInt   rN    = INSN(9,5);
      UInt   rD    = INSN(4,0);
      Bool   isSUB = bOP == 1;
      Bool   is64  = bX == 1;
      IRType ty    = is64 ? Ity_I64 : Ity_I32;
      if ((!is64 && imm6 > 31) || sh == BITS2(1,1)) {
         /* invalid; fall through */
      } else {
         IRTemp argL = newTemp(ty);
         assign(argL, getIRegOrZR(is64, rN));
         IRTemp argR = getShiftedIRegOrZR(is64, sh, imm6, rM, False);
         IROp   op   = isSUB ? mkSUB(ty) : mkADD(ty);
         IRTemp res  = newTemp(ty);
         assign(res, binop(op, mkexpr(argL), mkexpr(argR)));
         if (rD != 31) putIRegOrZR(is64, rD, mkexpr(res));
         if (bS) {
            setFlags_ADD_SUB(is64, isSUB, argL, argR);
         }
         DIP("%s%s %s, %s, %s, %s #%u\n",
             bOP ? "sub" : "add", bS ? "s" : "",
             nameIRegOrZR(is64, rD), nameIRegOrZR(is64, rN),
             nameIRegOrZR(is64, rM), nameSH(sh), imm6);
         return True;
      }
   }

/* ------------------- ADC/SBC(reg) ------------------- */
   /* x==0 => 32 bit op      x==1 => 64 bit op

31 30 29 28    23 21 20 15     9  4
      |  |  |  |     |  |  |  |      |  |
      x  0  0  11010 00 0  Rm 000000 Rn Rd   ADC  Rd,Rn,Rm
      x  0  1  11010 00 0  Rm 000000 Rn Rd   ADCS Rd,Rn,Rm
      x  1  0  11010 00 0  Rm 000000 Rn Rd   SBC  Rd,Rn,Rm
      x  1  1  11010 00 0  Rm 000000 Rn Rd   SBCS Rd,Rn,Rm
   */

if (INSN(28,21) == BITS8(1,1,0,1,0,0,0,0) && INSN(15,10) == 0 ) {
      UInt   bX    = INSN(31,31);
      UInt   bOP   = INSN(30,30); /* 0: ADC, 1: SBC */
      UInt   bS    = INSN(29,29); /* set flags */
      UInt   rM    = INSN(20,16);
      UInt   rN    = INSN(9,5);
      UInt   rD    = INSN(4,0);

Bool   isSUB = bOP == 1;
      Bool   is64  = bX == 1;
      IRType ty    = is64 ? Ity_I64 : Ity_I32;

IRTemp oldC = newTemp(ty);
      assign(oldC,
             is64 ? mk_arm64g_calculate_flag_c()
                  : unop(Iop_64to32, mk_arm64g_calculate_flag_c()) );

IRTemp argL = newTemp(ty);
      assign(argL, getIRegOrZR(is64, rN));
      IRTemp argR = newTemp(ty);
      assign(argR, getIRegOrZR(is64, rM));

IROp   op   = isSUB ? mkSUB(ty) : mkADD(ty);
      IRTemp res  = newTemp(ty);
      if (isSUB) {
         IRExpr* one = is64 ? mkU64(1) : mkU32(1);
         IROp xorOp = is64 ? Iop_Xor64 : Iop_Xor32;
         assign(res,
                binop(op,
                      binop(op, mkexpr(argL), mkexpr(argR)),
                      binop(xorOp, mkexpr(oldC), one)));
      } else {
         assign(res,
                binop(op,
                      binop(op, mkexpr(argL), mkexpr(argR)),
                      mkexpr(oldC)));
      }

if (rD != 31) putIRegOrZR(is64, rD, mkexpr(res));

if (bS) {
         setFlags_ADC_SBC(is64, isSUB, argL, argR, oldC);
      }

DIP("%s%s %s, %s, %s\n",
          bOP ? "sbc" : "adc", bS ? "s" : "",
          nameIRegOrZR(is64, rD), nameIRegOrZR(is64, rN),
          nameIRegOrZR(is64, rM));
      return True;
   }

/* -------------------- LOGIC(reg) -------------------- */   
   /* x==0 => 32 bit op      x==1 => 64 bit op
      N==0 => inv? is no-op (no inversion)
      N==1 => inv? is NOT
      sh: 00=LSL, 01=LSR, 10=ASR, 11=ROR

31 30 28    23 21 20 15   9  4
      |  |  |     |  |  |  |    |  |
      x  00 01010 sh N  Rm imm6 Rn Rd  AND  Rd,Rn, inv?(sh(Rm,imm6))
      x  01 01010 sh N  Rm imm6 Rn Rd  ORR  Rd,Rn, inv?(sh(Rm,imm6))
      x  10 01010 sh N  Rm imm6 Rn Rd  EOR  Rd,Rn, inv?(sh(Rm,imm6))
      x  11 01010 sh N  Rm imm6 Rn Rd  ANDS Rd,Rn, inv?(sh(Rm,imm6))
      With N=1, the names are: BIC ORN EON BICS
   */
   if (INSN(28,24) == BITS5(0,1,0,1,0)) {
      UInt   bX   = INSN(31,31);
      UInt   sh   = INSN(23,22);
      UInt   bN   = INSN(21,21);
      UInt   rM   = INSN(20,16);
      UInt   imm6 = INSN(15,10);
      UInt   rN   = INSN(9,5);
      UInt   rD   = INSN(4,0);
      Bool   is64 = bX == 1;
      IRType ty   = is64 ? Ity_I64 : Ity_I32;
      if (!is64 && imm6 > 31) {
         /* invalid; fall though */
      } else {
         IRTemp argL = newTemp(ty);
         assign(argL, getIRegOrZR(is64, rN));
         IRTemp argR = getShiftedIRegOrZR(is64, sh, imm6, rM, bN == 1);
         IROp   op   = Iop_INVALID;
         switch (INSN(30,29)) {
            case BITS2(0,0): case BITS2(1,1): op = mkAND(ty); break;
            case BITS2(0,1):                  op = mkOR(ty);  break;
            case BITS2(1,0):                  op = mkXOR(ty); break;
            default: vassert(0);
         }
         IRTemp res = newTemp(ty);
         assign(res, binop(op, mkexpr(argL), mkexpr(argR)));
         if (INSN(30,29) == BITS2(1,1)) {
            setFlags_LOGIC(is64, res);
         }
         putIRegOrZR(is64, rD, mkexpr(res));

static const HChar* names_op[8]
            = { "and", "orr", "eor", "ands", "bic", "orn", "eon", "bics" };
         vassert(((bN << 2) | INSN(30,29)) < 8);
         const HChar* nm_op = names_op[(bN << 2) | INSN(30,29)];
         /* Special-case the printing of "MOV" */
         if (rN == 31/*zr*/ && sh == 0/*LSL*/ && imm6 == 0 && bN == 0) {
            DIP("mov %s, %s\n", nameIRegOrZR(is64, rD),
                                nameIRegOrZR(is64, rM));
         } else {
            DIP("%s %s, %s, %s, %s #%u\n", nm_op,
                nameIRegOrZR(is64, rD), nameIRegOrZR(is64, rN),
                nameIRegOrZR(is64, rM), nameSH(sh), imm6);
         }
         return True;
      }
   }

/* -------------------- {U,S}MULH -------------------- */   
   /* 31       23 22 20 15     9   4
      10011011 1  10 Rm 011111 Rn Rd   UMULH Xd,Xn,Xm
      10011011 0  10 Rm 011111 Rn Rd   SMULH Xd,Xn,Xm
   */
   if (INSN(31,24) == BITS8(1,0,0,1,1,0,1,1)
       && INSN(22,21) == BITS2(1,0) && INSN(15,10) == BITS6(0,1,1,1,1,1)) {
      Bool isU = INSN(23,23) == 1;
      UInt mm  = INSN(20,16);
      UInt nn  = INSN(9,5);
      UInt dd  = INSN(4,0);
      putIReg64orZR(dd, unop(Iop_128HIto64,
                             binop(isU ? Iop_MullU64 : Iop_MullS64,
                                   getIReg64orZR(nn), getIReg64orZR(mm))));
      DIP("%cmulh %s, %s, %s\n", 
          isU ? 'u' : 's',
          nameIReg64orZR(dd), nameIReg64orZR(nn), nameIReg64orZR(mm));
      return True;
   }

/* -------------------- M{ADD,SUB} -------------------- */   
   /* 31 30           20 15 14 9 4
      sf 00 11011 000 m  0  a  n r   MADD Rd,Rn,Rm,Ra  d = a+m*n
      sf 00 11011 000 m  1  a  n r   MADD Rd,Rn,Rm,Ra  d = a-m*n
   */
   if (INSN(30,21) == BITS10(0,0,1,1,0,1,1,0,0,0)) {
      Bool is64  = INSN(31,31) == 1;
      UInt mm    = INSN(20,16);
      Bool isAdd = INSN(15,15) == 0;
      UInt aa    = INSN(14,10);
      UInt nn    = INSN(9,5);
      UInt dd    = INSN(4,0);
      if (is64) {
         putIReg64orZR(
            dd,
            binop(isAdd ? Iop_Add64 : Iop_Sub64,
                  getIReg64orZR(aa),
                  binop(Iop_Mul64, getIReg64orZR(mm), getIReg64orZR(nn))));
      } else {
         putIReg32orZR(
            dd,
            binop(isAdd ? Iop_Add32 : Iop_Sub32,
                  getIReg32orZR(aa),
                  binop(Iop_Mul32, getIReg32orZR(mm), getIReg32orZR(nn))));
      }
      DIP("%s %s, %s, %s, %s\n",
          isAdd ? "madd" : "msub",
          nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn),
          nameIRegOrZR(is64, mm), nameIRegOrZR(is64, aa));
      return True;
   }

/* ---------------- CS{EL,INC,INV,NEG} ---------------- */   
   /* 31 30 28        20 15   11 9  4
      sf 00 1101 0100 mm cond 00 nn dd   CSEL  Rd,Rn,Rm
      sf 00 1101 0100 mm cond 01 nn dd   CSINC Rd,Rn,Rm
      sf 10 1101 0100 mm cond 00 nn dd   CSINV Rd,Rn,Rm
      sf 10 1101 0100 mm cond 01 nn dd   CSNEG Rd,Rn,Rm
      In all cases, the operation is: Rd = if cond then Rn else OP(Rm)
   */
   if (INSN(29,21) == BITS9(0, 1,1,0,1, 0,1,0,0) && INSN(11,11) == 0) {
      Bool    is64 = INSN(31,31) == 1;
      UInt    b30  = INSN(30,30);
      UInt    mm   = INSN(20,16);
      UInt    cond = INSN(15,12);
      UInt    b10  = INSN(10,10);
      UInt    nn   = INSN(9,5);
      UInt    dd   = INSN(4,0);
      UInt    op   = (b30 << 1) | b10; /* 00=id 01=inc 10=inv 11=neg */
      IRType  ty   = is64 ? Ity_I64 : Ity_I32;
      IRExpr* argL = getIRegOrZR(is64, nn);
      IRExpr* argR = getIRegOrZR(is64, mm);
      switch (op) {
         case BITS2(0,0):
            break;
         case BITS2(0,1):
            argR = binop(mkADD(ty), argR, mkU(ty,1));
            break;
         case BITS2(1,0):
            argR = unop(mkNOT(ty), argR);
            break;
         case BITS2(1,1):
            argR = binop(mkSUB(ty), mkU(ty,0), argR);
            break;
         default:
            vassert(0);
      }
      putIRegOrZR(
         is64, dd,
         IRExpr_ITE(unop(Iop_64to1, mk_arm64g_calculate_condition(cond)),
                    argL, argR)
      );
      const HChar* op_nm[4] = { "csel", "csinc", "csinv", "csneg" };
      DIP("%s %s, %s, %s, %s\n", op_nm[op],
          nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn),
          nameIRegOrZR(is64, mm), nameCC(cond));
      return True;
   }

/* -------------- ADD/SUB(extended reg) -------------- */   
   /*     28         20 15  12   9 4
      000 01011 00 1 m  opt imm3 n d   ADD  Wd|SP, Wn|SP, Wm ext&lsld
      100 01011 00 1 m  opt imm3 n d   ADD  Xd|SP, Xn|SP, Rm ext&lsld

001 01011 00 1 m  opt imm3 n d   ADDS Wd,    Wn|SP, Wm ext&lsld
      101 01011 00 1 m  opt imm3 n d   ADDS Xd,    Xn|SP, Rm ext&lsld

010 01011 00 1 m  opt imm3 n d   SUB  Wd|SP, Wn|SP, Wm ext&lsld
      110 01011 00 1 m  opt imm3 n d   SUB  Xd|SP, Xn|SP, Rm ext&lsld

011 01011 00 1 m  opt imm3 n d   SUBS Wd,    Wn|SP, Wm ext&lsld
      111 01011 00 1 m  opt imm3 n d   SUBS Xd,    Xn|SP, Rm ext&lsld

The 'm' operand is extended per opt, thusly:

000   Xm & 0xFF           UXTB
        001   Xm & 0xFFFF         UXTH
        010   Xm & (2^32)-1       UXTW
        011   Xm                  UXTX

100   Xm sx from bit 7    SXTB
        101   Xm sx from bit 15   SXTH
        110   Xm sx from bit 31   SXTW
        111   Xm                  SXTX

In the 64 bit case (bit31 == 1), UXTX and SXTX are the identity
      operation on Xm.  In the 32 bit case, UXTW, UXTX, SXTW and SXTX
      are the identity operation on Wm.

After extension, the value is shifted left by imm3 bits, which
      may only be in the range 0 .. 4 inclusive.
   */
   if (INSN(28,21) == BITS8(0,1,0,1,1,0,0,1) && INSN(12,10) <= 4) {
      Bool is64  = INSN(31,31) == 1;
      Bool isSub = INSN(30,30) == 1;
      Bool setCC = INSN(29,29) == 1;
      UInt mm    = INSN(20,16);
      UInt opt   = INSN(15,13);
      UInt imm3  = INSN(12,10);
      UInt nn    = INSN(9,5);
      UInt dd    = INSN(4,0);
      const HChar* nameExt[8] = { "uxtb", "uxth", "uxtw", "uxtx",
                                  "sxtb", "sxth", "sxtw", "sxtx" };
      /* Do almost the same thing in the 32- and 64-bit cases. */
      IRTemp xN = newTemp(Ity_I64);
      IRTemp xM = newTemp(Ity_I64);
      assign(xN, getIReg64orSP(nn));
      assign(xM, getIReg64orZR(mm));
      IRExpr* xMw  = mkexpr(xM); /* "xM widened" */
      Int     shSX = 0;
      /* widen Xm .. */
      switch (opt) {
         case BITS3(0,0,0): // UXTB
            xMw = binop(Iop_And64, xMw, mkU64(0xFF)); break;
         case BITS3(0,0,1): // UXTH
            xMw = binop(Iop_And64, xMw, mkU64(0xFFFF)); break;
         case BITS3(0,1,0): // UXTW -- noop for the 32bit case
            if (is64) {
               xMw = unop(Iop_32Uto64, unop(Iop_64to32, xMw));
            }
            break;
         case BITS3(0,1,1): // UXTX -- always a noop
            break;
         case BITS3(1,0,0): // SXTB
            shSX = 56; goto sxTo64;
         case BITS3(1,0,1): // SXTH
            shSX = 48; goto sxTo64;
         case BITS3(1,1,0): // SXTW -- noop for the 32bit case
            if (is64) {
               shSX = 32; goto sxTo64;
            }
            break;
         case BITS3(1,1,1): // SXTX -- always a noop
            break;
         sxTo64:
            vassert(shSX >= 32);
            xMw = binop(Iop_Sar64, binop(Iop_Shl64, xMw, mkU8(shSX)),
                        mkU8(shSX));
            break;
         default:
            vassert(0);
      }
      /* and now shift */
      IRTemp argL = xN;
      IRTemp argR = newTemp(Ity_I64);
      assign(argR, binop(Iop_Shl64, xMw, mkU8(imm3)));
      IRTemp res = newTemp(Ity_I64);
      assign(res, binop(isSub ? Iop_Sub64 : Iop_Add64,
                        mkexpr(argL), mkexpr(argR)));
      if (is64) {
         if (setCC) {
            putIReg64orZR(dd, mkexpr(res));
            setFlags_ADD_SUB(True/*is64*/, isSub, argL, argR);
         } else {
            putIReg64orSP(dd, mkexpr(res));
         }
      } else {
         if (setCC) {
            IRTemp argL32 = newTemp(Ity_I32);
            IRTemp argR32 = newTemp(Ity_I32);
            putIReg32orZR(dd, unop(Iop_64to32, mkexpr(res)));
            assign(argL32, unop(Iop_64to32, mkexpr(argL)));
            assign(argR32, unop(Iop_64to32, mkexpr(argR)));
            setFlags_ADD_SUB(False/*!is64*/, isSub, argL32, argR32);
         } else {
            putIReg32orSP(dd, unop(Iop_64to32, mkexpr(res)));
         }
      }
      DIP("%s%s %s, %s, %s %s lsl %u\n",
          isSub ? "sub" : "add", setCC ? "s" : "",
          setCC ? nameIRegOrZR(is64, dd) : nameIRegOrSP(is64, dd),
          nameIRegOrSP(is64, nn), nameIRegOrSP(is64, mm),
          nameExt[opt], imm3);
      return True;
   }

/* ---------------- CCMP/CCMN(imm) ---------------- */
   /* Bizarrely, these appear in the "data processing register"
      category, even though they are operations against an
      immediate. */
   /* 31   29        20   15   11 9    3
      sf 1 111010010 imm5 cond 10 Rn 0 nzcv   CCMP Rn, #imm5, #nzcv, cond
      sf 0 111010010 imm5 cond 10 Rn 0 nzcv   CCMN Rn, #imm5, #nzcv, cond

Operation is:
         (CCMP) flags = if cond then flags-after-sub(Rn,imm5) else nzcv
         (CCMN) flags = if cond then flags-after-add(Rn,imm5) else nzcv
   */
   if (INSN(29,21) == BITS9(1,1,1,0,1,0,0,1,0)
       && INSN(11,10) == BITS2(1,0) && INSN(4,4) == 0) {
      Bool is64  = INSN(31,31) == 1;
      Bool isSUB = INSN(30,30) == 1;
      UInt imm5  = INSN(20,16);
      UInt cond  = INSN(15,12);
      UInt nn    = INSN(9,5);
      UInt nzcv  = INSN(3,0);

IRTemp condT = newTemp(Ity_I1);
      assign(condT, unop(Iop_64to1, mk_arm64g_calculate_condition(cond)));

IRType ty   = is64 ? Ity_I64 : Ity_I32;
      IRTemp argL = newTemp(ty);
      IRTemp argR = newTemp(ty);

if (is64) {
         assign(argL, getIReg64orZR(nn));
         assign(argR, mkU64(imm5));
      } else {
         assign(argL, getIReg32orZR(nn));
         assign(argR, mkU32(imm5));
      }
      setFlags_ADD_SUB_conditionally(is64, isSUB, condT, argL, argR, nzcv);

DIP("ccm%c %s, #%u, #%u, %s\n",
          isSUB ? 'p' : 'n', nameIRegOrZR(is64, nn),
          imm5, nzcv, nameCC(cond));
      return True;
   }

/* ---------------- CCMP/CCMN(reg) ---------------- */
   /* 31   29        20 15   11 9    3
      sf 1 111010010 Rm cond 00 Rn 0 nzcv   CCMP Rn, Rm, #nzcv, cond
      sf 0 111010010 Rm cond 00 Rn 0 nzcv   CCMN Rn, Rm, #nzcv, cond
      Operation is:
         (CCMP) flags = if cond then flags-after-sub(Rn,Rm) else nzcv
         (CCMN) flags = if cond then flags-after-add(Rn,Rm) else nzcv
   */
   if (INSN(29,21) == BITS9(1,1,1,0,1,0,0,1,0)
       && INSN(11,10) == BITS2(0,0) && INSN(4,4) == 0) {
      Bool is64  = INSN(31,31) == 1;
      Bool isSUB = INSN(30,30) == 1;
      UInt mm    = INSN(20,16);
      UInt cond  = INSN(15,12);
      UInt nn    = INSN(9,5);
      UInt nzcv  = INSN(3,0);

IRTemp condT = newTemp(Ity_I1);
      assign(condT, unop(Iop_64to1, mk_arm64g_calculate_condition(cond)));

IRType ty   = is64 ? Ity_I64 : Ity_I32;
      IRTemp argL = newTemp(ty);
      IRTemp argR = newTemp(ty);

if (is64) {
         assign(argL, getIReg64orZR(nn));
         assign(argR, getIReg64orZR(mm));
      } else {
         assign(argL, getIReg32orZR(nn));
         assign(argR, getIReg32orZR(mm));
      }
      setFlags_ADD_SUB_conditionally(is64, isSUB, condT, argL, argR, nzcv);

DIP("ccm%c %s, %s, #%u, %s\n",
          isSUB ? 'p' : 'n', nameIRegOrZR(is64, nn),
          nameIRegOrZR(is64, mm), nzcv, nameCC(cond));
      return True;
   }

/* -------------- REV/REV16/REV32/RBIT -------------- */   
   /* 31 30 28       20    15   11 9 4

1  10 11010110 00000 0000 11 n d    (1) REV   Xd, Xn
      0  10 11010110 00000 0000 10 n d    (2) REV   Wd, Wn

1  10 11010110 00000 0000 00 n d    (3) RBIT  Xd, Xn
      0  10 11010110 00000 0000 00 n d    (4) RBIT  Wd, Wn

1  10 11010110 00000 0000 01 n d    (5) REV16 Xd, Xn
      0  10 11010110 00000 0000 01 n d    (6) REV16 Wd, Wn

1  10 11010110 00000 0000 10 n d    (7) REV32 Xd, Xn
   */
   if (INSN(30,21) == BITS10(1,0,1,1,0,1,0,1,1,0)
       && INSN(20,12) == BITS9(0,0,0,0,0,0,0,0,0)) {
      UInt b31 = INSN(31,31);
      UInt opc = INSN(11,10);

UInt ix = 0;
      /**/ if (b31 == 1 && opc == BITS2(1,1)) ix = 1; 
      else if (b31 == 0 && opc == BITS2(1,0)) ix = 2; 
      else if (b31 == 1 && opc == BITS2(0,0)) ix = 3; 
      else if (b31 == 0 && opc == BITS2(0,0)) ix = 4; 
      else if (b31 == 1 && opc == BITS2(0,1)) ix = 5; 
      else if (b31 == 0 && opc == BITS2(0,1)) ix = 6; 
      else if (b31 == 1 && opc == BITS2(1,0)) ix = 7; 
      if (ix >= 1 && ix <= 7) {
         Bool   is64  = ix == 1 || ix == 3 || ix == 5 || ix == 7;
         UInt   nn    = INSN(9,5);
         UInt   dd    = INSN(4,0);
         IRTemp src   = newTemp(Ity_I64);
         IRTemp dst   = IRTemp_INVALID;
         IRTemp (*math)(IRTemp) = NULL;
         switch (ix) {
            case 1: case 2: math = math_BYTESWAP64;   break;
            case 3: case 4: math = math_BITSWAP64;    break;
            case 5: case 6: math = math_USHORTSWAP64; break;
            case 7:         math = math_UINTSWAP64;   break;
            default: vassert(0);
         }
         const HChar* names[7]
           = { "rev", "rev", "rbit", "rbit", "rev16", "rev16", "rev32" };
         const HChar* nm = names[ix-1];
         vassert(math);
         if (ix == 6) {
            /* This has to be special cased, since the logic below doesn't
               handle it correctly. */
            assign(src, getIReg64orZR(nn));
            dst = math(src);
            putIReg64orZR(dd,
                          unop(Iop_32Uto64, unop(Iop_64to32, mkexpr(dst))));
         } else if (is64) {
            assign(src, getIReg64orZR(nn));
            dst = math(src);
            putIReg64orZR(dd, mkexpr(dst));
         } else {
            assign(src, binop(Iop_Shl64, getIReg64orZR(nn), mkU8(32)));
            dst = math(src);
            putIReg32orZR(dd, unop(Iop_64to32, mkexpr(dst)));
         }
         DIP("%s %s, %s\n", nm,
             nameIRegOrZR(is64,dd), nameIRegOrZR(is64,nn));
         return True;
      }
      /* else fall through */
   }

/* -------------------- CLZ/CLS -------------------- */   
   /*    30 28   24   20    15      9 4
      sf 10 1101 0110 00000 00010 0 n d    CLZ Rd, Rn
      sf 10 1101 0110 00000 00010 1 n d    CLS Rd, Rn
   */
   if (INSN(30,21) == BITS10(1,0,1,1,0,1,0,1,1,0)
       && INSN(20,11) == BITS10(0,0,0,0,0,0,0,0,1,0)) {
      Bool   is64  = INSN(31,31) == 1;
      Bool   isCLS = INSN(10,10) == 1;
      UInt   nn    = INSN(9,5);
      UInt   dd    = INSN(4,0);
      IRTemp src   = newTemp(Ity_I64);
      IRTemp srcZ  = newTemp(Ity_I64);
      IRTemp dst   = newTemp(Ity_I64);
      /* Get the argument, widened out to 64 bit */
      if (is64) {
         assign(src, getIReg64orZR(nn));
      } else {
         assign(src, binop(Iop_Shl64,
                           unop(Iop_32Uto64, getIReg32orZR(nn)), mkU8(32)));
      }
      /* If this is CLS, mash the arg around accordingly */
      if (isCLS) {
         IRExpr* one = mkU8(1);
         assign(srcZ,
         binop(Iop_Xor64,
               binop(Iop_Shl64, mkexpr(src), one),
               binop(Iop_Shl64, binop(Iop_Shr64, mkexpr(src), one), one)));
      } else {
         assign(srcZ, mkexpr(src));
      }
      /* And compute CLZ. */
      if (is64) {
         assign(dst, IRExpr_ITE(binop(Iop_CmpEQ64, mkexpr(srcZ), mkU64(0)),
                                mkU64(isCLS ? 63 : 64),
                                unop(Iop_Clz64, mkexpr(srcZ))));
         putIReg64orZR(dd, mkexpr(dst));
      } else {
         assign(dst, IRExpr_ITE(binop(Iop_CmpEQ64, mkexpr(srcZ), mkU64(0)),
                                mkU64(isCLS ? 31 : 32),
                                unop(Iop_Clz64, mkexpr(srcZ))));
         putIReg32orZR(dd, unop(Iop_64to32, mkexpr(dst)));
      }
      DIP("cl%c %s, %s\n", isCLS ? 's' : 'z',
          nameIRegOrZR(is64, dd), nameIRegOrZR(is64, nn));
      return True;
   }

/* ------------------ LSLV/LSRV/ASRV/RORV ------------------ */   
   /*    30 28        20 15   11 9 4
      sf 00 1101 0110 m  0010 00 n d   LSLV Rd,Rn,Rm
      sf 00 1101 0110 m  0010 01 n d   LSRV Rd,Rn,Rm
      sf 00 1101 0110 m  0010 10 n d   ASRV Rd,Rn,Rm
      sf 00 1101 0110 m  0010 11 n d   RORV Rd,Rn,Rm
   */
   if (INSN(30,21) == BITS10(0,0,1,1,0,1,0,1,1,0)
       && INSN(15,12) == BITS4(0,0,1,0)) {
      Bool   is64 = INSN(31,31) == 1;
      UInt   mm   = INSN(20,16);
      UInt   op   = INSN(11,10);
      UInt   nn   = INSN(9,5);
      UInt   dd   = INSN(4,0);
      IRType ty   = is64 ? Ity_I64 : Ity_I32;
      IRTemp srcL = newTemp(ty);
      IRTemp srcR = newTemp(Ity_I64);
      IRTemp res  = newTemp(ty);
      IROp   iop  = Iop_INVALID;
      assign(srcL, getIRegOrZR(is64, nn));
      assign(srcR, binop(Iop_And64, getIReg64orZR(mm),
                                    mkU64(is64 ? 63 : 31)));
      if (op < 3) {
         // LSLV, LSRV, ASRV
         switch (op) {
            case BITS2(0,0): iop = mkSHL(ty); break;
            case BITS2(0,1): iop = mkSHR(ty); break;
            case BITS2(1,0): iop = mkSAR(ty); break;
            default: vassert(0);
         }
         assign(res, binop(iop, mkexpr(srcL),
                                unop(Iop_64to8, mkexpr(srcR))));
      } else {
         // RORV
         IROp opSHL = mkSHL(ty);
         IROp opSHR = mkSHR(ty);
         IROp opOR  = mkOR(ty);
         IRExpr* width = mkU64(is64 ? 64: 32);
         assign(
            res,
            IRExpr_ITE(
               binop(Iop_CmpEQ64, mkexpr(srcR), mkU64(0)),
               mkexpr(srcL),
               binop(opOR,
                     binop(opSHL,
                           mkexpr(srcL),
                           unop(Iop_64to8, binop(Iop_Sub64, width,
                                                            mkexpr(srcR)))),
                     binop(opSHR,
                           mkexpr(srcL), unop(Iop_64to8, mkexpr(srcR))))
         ));
      }
      putIRegOrZR(is64, dd, mkexpr(res));
      vassert(op < 4);
      const HChar* names[4] = { "lslv", "lsrv", "asrv", "rorv" };
      DIP("%s %s, %s, %s\n",
          names[op], nameIRegOrZR(is64,dd),
                     nameIRegOrZR(is64,nn), nameIRegOrZR(is64,mm));
      return True;
   }

/* -------------------- SDIV/UDIV -------------------- */   
   /*    30 28        20 15    10 9 4
      sf 00 1101 0110 m  00001  1 n d  SDIV Rd,Rn,Rm
      sf 00 1101 0110 m  00001  0 n d  UDIV Rd,Rn,Rm
   */
   if (INSN(30,21) == BITS10(0,0,1,1,0,1,0,1,1,0)
       && INSN(15,11) == BITS5(0,0,0,0,1)) {
      Bool is64 = INSN(31,31) == 1;
      UInt mm   = INSN(20,16);
      Bool isS  = INSN(10,10) == 1;
      UInt nn   = INSN(9,5);
      UInt dd   = INSN(4,0);
      if (isS) {
         putIRegOrZR(is64, dd, binop(is64 ? Iop_DivS64 : Iop_DivS32,
                                     getIRegOrZR(is64, nn),
                                     getIRegOrZR(is64, mm)));
      } else {
         putIRegOrZR(is64, dd, binop(is64 ? Iop_DivU64 : Iop_DivU32,
                                     getIRegOrZR(is64, nn),
                                     getIRegOrZR(is64, mm)));
      }
      DIP("%cdiv %s, %s, %s\n", isS ? 's' : 'u',
          nameIRegOrZR(is64, dd),
          nameIRegOrZR(is64, nn), nameIRegOrZR(is64, mm));
      return True;
   }

/* ------------------ {S,U}M{ADD,SUB}L ------------------ */   
   /* 31        23  20 15 14 9 4
      1001 1011 101 m  0  a  n d   UMADDL Xd,Wn,Wm,Xa
      1001 1011 001 m  0  a  n d   SMADDL Xd,Wn,Wm,Xa
      1001 1011 101 m  1  a  n d   UMSUBL Xd,Wn,Wm,Xa
      1001 1011 001 m  1  a  n d   SMSUBL Xd,Wn,Wm,Xa
      with operation
         Xd = Xa +/- (Wn *u/s Wm)
   */
   if (INSN(31,24) == BITS8(1,0,0,1,1,0,1,1) && INSN(22,21) == BITS2(0,1)) {
      Bool   isU   = INSN(23,23) == 1;
      UInt   mm    = INSN(20,16);
      Bool   isAdd = INSN(15,15) == 0;
      UInt   aa    = INSN(14,10);
      UInt   nn    = INSN(9,5);
      UInt   dd    = INSN(4,0);
      IRTemp wN    = newTemp(Ity_I32);
      IRTemp wM    = newTemp(Ity_I32);
      IRTemp xA    = newTemp(Ity_I64);
      IRTemp muld  = newTemp(Ity_I64);
      IRTemp res   = newTemp(Ity_I64);
      assign(wN, getIReg32orZR(nn));
      assign(wM, getIReg32orZR(mm));
      assign(xA, getIReg64orZR(aa));
      assign(muld, binop(isU ? Iop_MullU32 : Iop_MullS32,
                         mkexpr(wN), mkexpr(wM)));
      assign(res, binop(isAdd ? Iop_Add64 : Iop_Sub64,
                        mkexpr(xA), mkexpr(muld)));
      putIReg64orZR(dd, mkexpr(res));
      DIP("%cm%sl %s, %s, %s, %s\n", isU ? 'u' : 's', isAdd ? "add" : "sub",
          nameIReg64orZR(dd), nameIReg32orZR(nn),
          nameIReg32orZR(mm), nameIReg64orZR(aa));
      return True;
   }
   vex_printf("ARM64 front end: data_processing_register\n");
   return False;
#  undef INSN
}

/*------------------------------------------------------------*/
/*--- Math helpers for vector interleave/deinterleave      ---*/
/*------------------------------------------------------------*/

#define EX(_tmp) \
           mkexpr(_tmp)
#define SL(_hi128,_lo128,_nbytes) \
           ( (_nbytes) == 0 \
                ? (_lo128) \
                : triop(Iop_SliceV128,(_hi128),(_lo128),mkU8(_nbytes)) )
#define ROR(_v128,_nbytes) \
           SL((_v128),(_v128),(_nbytes))
#define ROL(_v128,_nbytes) \
           SL((_v128),(_v128),16-(_nbytes))
#define SHR(_v128,_nbytes) \
           binop(Iop_ShrV128,(_v128),mkU8(8*(_nbytes)))
#define SHL(_v128,_nbytes) \
           binop(Iop_ShlV128,(_v128),mkU8(8*(_nbytes)))
#define ILO64x2(_argL,_argR) \
           binop(Iop_InterleaveLO64x2,(_argL),(_argR))
#define IHI64x2(_argL,_argR) \
           binop(Iop_InterleaveHI64x2,(_argL),(_argR))
#define ILO32x4(_argL,_argR) \
           binop(Iop_InterleaveLO32x4,(_argL),(_argR))
#define IHI32x4(_argL,_argR) \
           binop(Iop_InterleaveHI32x4,(_argL),(_argR))
#define ILO16x8(_argL,_argR) \
           binop(Iop_InterleaveLO16x8,(_argL),(_argR))
#define IHI16x8(_argL,_argR) \
           binop(Iop_InterleaveHI16x8,(_argL),(_argR))
#define ILO8x16(_argL,_argR) \
           binop(Iop_InterleaveLO8x16,(_argL),(_argR))
#define IHI8x16(_argL,_argR) \
           binop(Iop_InterleaveHI8x16,(_argL),(_argR))
#define CEV32x4(_argL,_argR) \
           binop(Iop_CatEvenLanes32x4,(_argL),(_argR))
#define COD32x4(_argL,_argR) \
           binop(Iop_CatOddLanes32x4,(_argL),(_argR))
#define COD16x8(_argL,_argR) \
           binop(Iop_CatOddLanes16x8,(_argL),(_argR))
#define COD8x16(_argL,_argR) \
           binop(Iop_CatOddLanes8x16,(_argL),(_argR))
#define CEV8x16(_argL,_argR) \
           binop(Iop_CatEvenLanes8x16,(_argL),(_argR))
#define AND(_arg1,_arg2) \
           binop(Iop_AndV128,(_arg1),(_arg2))
#define OR2(_arg1,_arg2) \
           binop(Iop_OrV128,(_arg1),(_arg2))
#define OR3(_arg1,_arg2,_arg3) \
           binop(Iop_OrV128,(_arg1),binop(Iop_OrV128,(_arg2),(_arg3)))
#define OR4(_arg1,_arg2,_arg3,_arg4) \
           binop(Iop_OrV128, \
                 binop(Iop_OrV128,(_arg1),(_arg2)), \
                 binop(Iop_OrV128,(_arg3),(_arg4)))

/* Do interleaving for 1 128 bit vector, for ST1 insns. */
static
void math_INTERLEAVE1_128( /*OUTx1*/ IRTemp* i0,
                           UInt laneSzBlg2, IRTemp u0 )
{
   assign(*i0, mkexpr(u0));
}

/* Do interleaving for 2 128 bit vectors, for ST2 insns. */
static
void math_INTERLEAVE2_128( /*OUTx2*/ IRTemp* i0, IRTemp* i1,
                           UInt laneSzBlg2, IRTemp u0, IRTemp u1 )
{
   /* This is pretty easy, since we have primitives directly to
      hand. */
   if (laneSzBlg2 == 3) {
      // 64x2
      // u1 == B1 B0, u0 == A1 A0
      // i1 == B1 A1, i0 == B0 A0
      assign(*i0, binop(Iop_InterleaveLO64x2, mkexpr(u1), mkexpr(u0)));
      assign(*i1, binop(Iop_InterleaveHI64x2, mkexpr(u1), mkexpr(u0)));
      return;
   }
   if (laneSzBlg2 == 2) {
      // 32x4
      // u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0, 
      // i1 == B3 A3 B2 A2, i0 == B1 A1 B0 A0
      assign(*i0, binop(Iop_InterleaveLO32x4, mkexpr(u1), mkexpr(u0)));
      assign(*i1, binop(Iop_InterleaveHI32x4, mkexpr(u1), mkexpr(u0)));
      return;
   } 
   if (laneSzBlg2 == 1) {
      // 16x8
      // u1 == B{7..0}, u0 == A{7..0}
      // i0 == B3 A3 B2 A2 B1 A1 B0 A0
      // i1 == B7 A7 B6 A6 B5 A5 B4 A4
      assign(*i0, binop(Iop_InterleaveLO16x8, mkexpr(u1), mkexpr(u0)));
      assign(*i1, binop(Iop_InterleaveHI16x8, mkexpr(u1), mkexpr(u0)));
      return;
   }
   if (laneSzBlg2 == 0) {
      // 8x16
      // u1 == B{f..0}, u0 == A{f..0}
      // i0 == B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0
      // i1 == Bf Af Be Ae Bd Ad Bc Ac Bb Ab Ba Aa B9 A9 B8 A8
      assign(*i0, binop(Iop_InterleaveLO8x16, mkexpr(u1), mkexpr(u0)));
      assign(*i1, binop(Iop_InterleaveHI8x16, mkexpr(u1), mkexpr(u0)));
      return;
   }
   /*NOTREACHED*/
   vassert(0);
}

/* Do interleaving for 3 128 bit vectors, for ST3 insns. */
static
void math_INTERLEAVE3_128( 
        /*OUTx3*/ IRTemp* i0, IRTemp* i1, IRTemp* i2,
        UInt laneSzBlg2,
        IRTemp u0, IRTemp u1, IRTemp u2 )
{
   if (laneSzBlg2 == 3) {
      // 64x2
      // u2 == C1 C0, u1 == B1 B0, u0 == A1 A0
      // i2 == C1 B1, i1 == A1 C0, i0 == B0 A0,
      assign(*i2, IHI64x2( EX(u2), EX(u1) ));
      assign(*i1, ILO64x2( ROR(EX(u0),8), EX(u2) ));
      assign(*i0, ILO64x2( EX(u1), EX(u0) ));
      return;
   }

if (laneSzBlg2 == 2) {
      // 32x4
      // u2 == C3 C2 C1 C0, u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0
      // p2 == C3 C2 B3 B2, p1 == A3 A2 C1 C0, p0 == B1 B0 A1 A0
      // i2 == C3 B3 A2 C2, i1 == B2 A2 C1 B1, i0 == A1 C0 B0 A0
      IRTemp p0    = newTempV128();
      IRTemp p1    = newTempV128();
      IRTemp p2    = newTempV128();
      IRTemp c1100 = newTempV128();
      IRTemp c0011 = newTempV128();
      IRTemp c0110 = newTempV128();
      assign(c1100, mkV128(0xFF00));
      assign(c0011, mkV128(0x00FF));
      assign(c0110, mkV128(0x0FF0));
      // First interleave them at 64x2 granularity,
      // generating partial ("p") values.
      math_INTERLEAVE3_128(&p0, &p1, &p2, 3, u0, u1, u2);
      // And more shuffling around for the final answer
      assign(*i2, OR2( AND( IHI32x4(EX(p2), ROL(EX(p2),8)), EX(c1100) ),
                       AND( IHI32x4(ROR(EX(p1),4), EX(p2)), EX(c0011) ) ));
      assign(*i1, OR3( SHL(EX(p2),12),
                       AND(EX(p1),EX(c0110)),
                       SHR(EX(p0),12) ));
      assign(*i0, OR2( AND( ILO32x4(EX(p0),ROL(EX(p1),4)), EX(c1100) ),
                       AND( ILO32x4(ROR(EX(p0),8),EX(p0)), EX(c0011) ) ));
      return;
   }

if (laneSzBlg2 == 1) {
      // 16x8
      // u2 == C7 C6 C5 C4 C3 C2 C1 C0
      // u1 == B7 B6 B5 B4 B3 B2 B1 B0
      // u0 == A7 A6 A5 A4 A3 A2 A1 A0
      //
      // p2 == C7 C6 B7 B6 A7 A6 C5 C4
      // p1 == B5 B4 A5 A4 C3 C2 B3 B2
      // p0 == A3 A2 C1 C0 B1 B0 A1 A0
      //
      // i2 == C7 B7 A7 C6 B6 A6 C5 B5
      // i1 == A5 C4 B4 A4 C4 B3 A3 C2
      // i0 == B2 A2 C1 B1 A1 C0 B0 A0
      IRTemp p0    = newTempV128();
      IRTemp p1    = newTempV128();
      IRTemp p2    = newTempV128();
      IRTemp c1000 = newTempV128();
      IRTemp c0100 = newTempV128();
      IRTemp c0010 = newTempV128();
      IRTemp c0001 = newTempV128();
      assign(c1000, mkV128(0xF000));
      assign(c0100, mkV128(0x0F00));
      assign(c0010, mkV128(0x00F0));
      assign(c0001, mkV128(0x000F));
      // First interleave them at 32x4 granularity,
      // generating partial ("p") values.
      math_INTERLEAVE3_128(&p0, &p1, &p2, 2, u0, u1, u2);
      // And more shuffling around for the final answer
      assign(*i2,
             OR4( AND( IHI16x8( EX(p2),        ROL(EX(p2),4) ), EX(c1000) ),
                  AND( IHI16x8( ROL(EX(p2),6), EX(p2)        ), EX(c0100) ),
                  AND( IHI16x8( ROL(EX(p2),2), ROL(EX(p2),6) ), EX(c0010) ),
                  AND( ILO16x8( ROR(EX(p2),2), ROL(EX(p1),2) ), EX(c0001) )
      ));
      assign(*i1,
             OR4( AND( IHI16x8( ROL(EX(p1),4), ROR(EX(p2),2) ), EX(c1000) ),
                  AND( IHI16x8( EX(p1),        ROL(EX(p1),4) ), EX(c0100) ),
                  AND( IHI16x8( ROL(EX(p1),4), ROL(EX(p1),8) ), EX(c0010) ),
                  AND( IHI16x8( ROR(EX(p0),6), ROL(EX(p1),4) ), EX(c0001) )
      ));
      assign(*i0,
             OR4( AND( IHI16x8( ROR(EX(p1),2), ROL(EX(p0),2) ), EX(c1000) ),
                  AND( IHI16x8( ROL(EX(p0),2), ROL(EX(p0),6) ), EX(c0100) ),
                  AND( IHI16x8( ROL(EX(p0),8), ROL(EX(p0),2) ), EX(c0010) ),
                  AND( IHI16x8( ROL(EX(p0),4), ROL(EX(p0),8) ), EX(c0001) )
      ));
      return;
   }

if (laneSzBlg2 == 0) {
      // 8x16.  It doesn't seem worth the hassle of first doing a
      // 16x8 interleave, so just generate all 24 partial results
      // directly :-(
      // u2 == Cf .. C0, u1 == Bf .. B0, u0 == Af .. A0
      // i2 == Cf Bf Af Ce .. Bb Ab Ca
      // i1 == Ba Aa C9 B9 .. A6 C5 B5
      // i0 == A5 C4 B4 A4 .. C0 B0 A0

IRTemp i2_FEDC = newTempV128(); IRTemp i2_BA98 = newTempV128();
      IRTemp i2_7654 = newTempV128(); IRTemp i2_3210 = newTempV128();
      IRTemp i1_FEDC = newTempV128(); IRTemp i1_BA98 = newTempV128();
      IRTemp i1_7654 = newTempV128(); IRTemp i1_3210 = newTempV128();
      IRTemp i0_FEDC = newTempV128(); IRTemp i0_BA98 = newTempV128();
      IRTemp i0_7654 = newTempV128(); IRTemp i0_3210 = newTempV128();
      IRTemp i2_hi64 = newTempV128(); IRTemp i2_lo64 = newTempV128();
      IRTemp i1_hi64 = newTempV128(); IRTemp i1_lo64 = newTempV128();
      IRTemp i0_hi64 = newTempV128(); IRTemp i0_lo64 = newTempV128();

// eg XXXX(qqq, CC, 0xF, BB, 0xA)) sets qqq to be a vector
      // of the form 14 bytes junk : CC[0xF] : BB[0xA]
      //
#     define XXXX(_tempName,_srcVec1,_srcShift1,_srcVec2,_srcShift2) \
         IRTemp t_##_tempName = newTempV128(); \
         assign(t_##_tempName, \
                ILO8x16( ROR(EX(_srcVec1),(_srcShift1)), \
                         ROR(EX(_srcVec2),(_srcShift2)) ) )

// Let CC, BB, AA be (handy) aliases of u2, u1, u0 respectively
      IRTemp CC = u2; IRTemp BB = u1; IRTemp AA = u0;

// The slicing and reassembly are done as interleavedly as possible,
      // so as to minimise the demand for registers in the back end, which
      // was observed to be a problem in testing.

XXXX(CfBf, CC, 0xf, BB, 0xf); // i2[15:14]
      XXXX(AfCe, AA, 0xf, CC, 0xe);
      assign(i2_FEDC, ILO16x8(EX(t_CfBf), EX(t_AfCe)));

XXXX(BeAe, BB, 0xe, AA, 0xe);
      XXXX(CdBd, CC, 0xd, BB, 0xd);
      assign(i2_BA98, ILO16x8(EX(t_BeAe), EX(t_CdBd)));
      assign(i2_hi64, ILO32x4(EX(i2_FEDC), EX(i2_BA98)));

XXXX(AdCc, AA, 0xd, CC, 0xc);
      XXXX(BcAc, BB, 0xc, AA, 0xc);
      assign(i2_7654, ILO16x8(EX(t_AdCc), EX(t_BcAc)));

XXXX(CbBb, CC, 0xb, BB, 0xb);
      XXXX(AbCa, AA, 0xb, CC, 0xa); // i2[1:0] 
      assign(i2_3210, ILO16x8(EX(t_CbBb), EX(t_AbCa)));
      assign(i2_lo64, ILO32x4(EX(i2_7654), EX(i2_3210)));
      assign(*i2, ILO64x2(EX(i2_hi64), EX(i2_lo64)));

XXXX(BaAa, BB, 0xa, AA, 0xa); // i1[15:14]
      XXXX(C9B9, CC, 0x9, BB, 0x9);
      assign(i1_FEDC, ILO16x8(EX(t_BaAa), EX(t_C9B9)));

XXXX(A9C8, AA, 0x9, CC, 0x8);
      XXXX(B8A8, BB, 0x8, AA, 0x8);
      assign(i1_BA98, ILO16x8(EX(t_A9C8), EX(t_B8A8)));
      assign(i1_hi64, ILO32x4(EX(i1_FEDC), EX(i1_BA98)));

XXXX(C7B7, CC, 0x7, BB, 0x7);
      XXXX(A7C6, AA, 0x7, CC, 0x6);
      assign(i1_7654, ILO16x8(EX(t_C7B7), EX(t_A7C6)));

XXXX(B6A6, BB, 0x6, AA, 0x6);
      XXXX(C5B5, CC, 0x5, BB, 0x5); // i1[1:0]
      assign(i1_3210, ILO16x8(EX(t_B6A6), EX(t_C5B5)));
      assign(i1_lo64, ILO32x4(EX(i1_7654), EX(i1_3210)));
      assign(*i1, ILO64x2(EX(i1_hi64), EX(i1_lo64)));

XXXX(A5C4, AA, 0x5, CC, 0x4); // i0[15:14]
      XXXX(B4A4, BB, 0x4, AA, 0x4);
      assign(i0_FEDC, ILO16x8(EX(t_A5C4), EX(t_B4A4)));

XXXX(C3B3, CC, 0x3, BB, 0x3);
      XXXX(A3C2, AA, 0x3, CC, 0x2);
      assign(i0_BA98, ILO16x8(EX(t_C3B3), EX(t_A3C2)));
      assign(i0_hi64, ILO32x4(EX(i0_FEDC), EX(i0_BA98)));

XXXX(B2A2, BB, 0x2, AA, 0x2);
      XXXX(C1B1, CC, 0x1, BB, 0x1);
      assign(i0_7654, ILO16x8(EX(t_B2A2), EX(t_C1B1)));

XXXX(A1C0, AA, 0x1, CC, 0x0);
      XXXX(B0A0, BB, 0x0, AA, 0x0); // i0[1:0]
      assign(i0_3210, ILO16x8(EX(t_A1C0), EX(t_B0A0)));
      assign(i0_lo64, ILO32x4(EX(i0_7654), EX(i0_3210)));
      assign(*i0, ILO64x2(EX(i0_hi64), EX(i0_lo64)));

#     undef XXXX
      return;
   }

/*NOTREACHED*/
   vassert(0);
}

/* Do interleaving for 4 128 bit vectors, for ST4 insns. */
static
void math_INTERLEAVE4_128( 
        /*OUTx4*/ IRTemp* i0, IRTemp* i1, IRTemp* i2, IRTemp* i3,
        UInt laneSzBlg2,
        IRTemp u0, IRTemp u1, IRTemp u2, IRTemp u3 )
{
   if (laneSzBlg2 == 3) {
      // 64x2
      assign(*i0, ILO64x2(EX(u1), EX(u0)));
      assign(*i1, ILO64x2(EX(u3), EX(u2)));
      assign(*i2, IHI64x2(EX(u1), EX(u0)));
      assign(*i3, IHI64x2(EX(u3), EX(u2)));
      return;
   }
   if (laneSzBlg2 == 2) {
      // 32x4
      // First, interleave at the 64-bit lane size.
      IRTemp p0 = newTempV128();
      IRTemp p1 = newTempV128();
      IRTemp p2 = newTempV128();
      IRTemp p3 = newTempV128();
      math_INTERLEAVE4_128(&p0, &p1, &p2, &p3, 3, u0, u1, u2, u3);
      // And interleave (cat) at the 32 bit size.
      assign(*i0, CEV32x4(EX(p1), EX(p0)));
      assign(*i1, COD32x4(EX(p1), EX(p0)));
      assign(*i2, CEV32x4(EX(p3), EX(p2)));
      assign(*i3, COD32x4(EX(p3), EX(p2)));
      return;
   }
   if (laneSzBlg2 == 1) {
      // 16x8
      // First, interleave at the 32-bit lane size.
      IRTemp p0 = newTempV128();
      IRTemp p1 = newTempV128();
      IRTemp p2 = newTempV128();
      IRTemp p3 = newTempV128();
      math_INTERLEAVE4_128(&p0, &p1, &p2, &p3, 2, u0, u1, u2, u3);
      // And rearrange within each vector, to get the right 16 bit lanes.
      assign(*i0, COD16x8(EX(p0), SHL(EX(p0), 2)));
      assign(*i1, COD16x8(EX(p1), SHL(EX(p1), 2)));
      assign(*i2, COD16x8(EX(p2), SHL(EX(p2), 2)));
      assign(*i3, COD16x8(EX(p3), SHL(EX(p3), 2)));
      return;
   }
   if (laneSzBlg2 == 0) {
      // 8x16
      // First, interleave at the 16-bit lane size.
      IRTemp p0 = newTempV128();
      IRTemp p1 = newTempV128();
      IRTemp p2 = newTempV128();
      IRTemp p3 = newTempV128();
      math_INTERLEAVE4_128(&p0, &p1, &p2, &p3, 1, u0, u1, u2, u3);
      // And rearrange within each vector, to get the right 8 bit lanes.
      assign(*i0, IHI32x4(COD8x16(EX(p0),EX(p0)), CEV8x16(EX(p0),EX(p0))));
      assign(*i1, IHI32x4(COD8x16(EX(p1),EX(p1)), CEV8x16(EX(p1),EX(p1))));
      assign(*i2, IHI32x4(COD8x16(EX(p2),EX(p2)), CEV8x16(EX(p2),EX(p2))));
      assign(*i3, IHI32x4(COD8x16(EX(p3),EX(p3)), CEV8x16(EX(p3),EX(p3))));
      return;
   }
   /*NOTREACHED*/
   vassert(0);
}

/* Do deinterleaving for 1 128 bit vector, for LD1 insns. */
static
void math_DEINTERLEAVE1_128( /*OUTx1*/ IRTemp* u0,
                             UInt laneSzBlg2, IRTemp i0 )
{
   assign(*u0, mkexpr(i0));
}

/* Do deinterleaving for 2 128 bit vectors, for LD2 insns. */
static
void math_DEINTERLEAVE2_128( /*OUTx2*/ IRTemp* u0, IRTemp* u1,
                             UInt laneSzBlg2, IRTemp i0, IRTemp i1 )
{
   /* This is pretty easy, since we have primitives directly to
      hand. */
   if (laneSzBlg2 == 3) {
      // 64x2
      // i1 == B1 A1, i0 == B0 A0
      // u1 == B1 B0, u0 == A1 A0
      assign(*u0, binop(Iop_InterleaveLO64x2, mkexpr(i1), mkexpr(i0)));
      assign(*u1, binop(Iop_InterleaveHI64x2, mkexpr(i1), mkexpr(i0)));
      return;
   }
   if (laneSzBlg2 == 2) {
      // 32x4
      // i1 == B3 A3 B2 A2, i0 == B1 A1 B0 A0
      // u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0, 
      assign(*u0, binop(Iop_CatEvenLanes32x4, mkexpr(i1), mkexpr(i0)));
      assign(*u1, binop(Iop_CatOddLanes32x4, mkexpr(i1), mkexpr(i0)));
      return;
   }
   if (laneSzBlg2 == 1) {
      // 16x8
      // i0 == B3 A3 B2 A2 B1 A1 B0 A0
      // i1 == B7 A7 B6 A6 B5 A5 B4 A4
      // u1 == B{7..0}, u0 == A{7..0}
      assign(*u0, binop(Iop_CatEvenLanes16x8, mkexpr(i1), mkexpr(i0)));
      assign(*u1, binop(Iop_CatOddLanes16x8,  mkexpr(i1), mkexpr(i0)));
      return;
   }
   if (laneSzBlg2 == 0) {
      // 8x16
      // i0 == B7 A7 B6 A6 B5 A5 B4 A4 B3 A3 B2 A2 B1 A1 B0 A0
      // i1 == Bf Af Be Ae Bd Ad Bc Ac Bb Ab Ba Aa B9 A9 B8 A8
      // u1 == B{f..0}, u0 == A{f..0}
      assign(*u0, binop(Iop_CatEvenLanes8x16, mkexpr(i1), mkexpr(i0)));
      assign(*u1, binop(Iop_CatOddLanes8x16,  mkexpr(i1), mkexpr(i0)));
      return;
   }
   /*NOTREACHED*/
   vassert(0);
}

/* Do deinterleaving for 3 128 bit vectors, for LD3 insns. */
static
void math_DEINTERLEAVE3_128( 
        /*OUTx3*/ IRTemp* u0, IRTemp* u1, IRTemp* u2,
        UInt laneSzBlg2,
        IRTemp i0, IRTemp i1, IRTemp i2 )
{
   if (laneSzBlg2 == 3) {
      // 64x2
      // i2 == C1 B1, i1 == A1 C0, i0 == B0 A0,
      // u2 == C1 C0, u1 == B1 B0, u0 == A1 A0
      assign(*u2, ILO64x2( ROL(EX(i2),8), EX(i1)        ));
      assign(*u1, ILO64x2( EX(i2),        ROL(EX(i0),8) ));
      assign(*u0, ILO64x2( ROL(EX(i1),8), EX(i0)        ));
      return;
   }

if (laneSzBlg2 == 2) {
      // 32x4
      // i2 == C3 B3 A2 C2, i1 == B2 A2 C1 B1, i0 == A1 C0 B0 A0
      // p2 == C3 C2 B3 B2, p1 == A3 A2 C1 C0, p0 == B1 B0 A1 A0
      // u2 == C3 C2 C1 C0, u1 == B3 B2 B1 B0, u0 == A3 A2 A1 A0
      IRTemp t_a1c0b0a0 = newTempV128();
      IRTemp t_a2c1b1a1 = newTempV128();
      IRTemp t_a3c2b2a2 = newTempV128();
      IRTemp t_a0c3b3a3 = newTempV128();
      IRTemp p0 = newTempV128();
      IRTemp p1 = newTempV128();
      IRTemp p2 = newTempV128();
      // Compute some intermediate values.
      assign(t_a1c0b0a0, EX(i0));
      assign(t_a2c1b1a1, SL(EX(i1),EX(i0),3*4));
      assign(t_a3c2b2a2, SL(EX(i2),EX(i1),2*4));
      assign(t_a0c3b3a3, SL(EX(i0),EX(i2),1*4));
      // First deinterleave into lane-pairs
      assign(p0, ILO32x4(EX(t_a2c1b1a1),EX(t_a1c0b0a0)));
      assign(p1, ILO64x2(ILO32x4(EX(t_a0c3b3a3), EX(t_a3c2b2a2)),
                         IHI32x4(EX(t_a2c1b1a1), EX(t_a1c0b0a0))));
      assign(p2, ILO32x4(ROR(EX(t_a0c3b3a3),1*4), ROR(EX(t_a3c2b2a2),1*4)));
      // Then deinterleave at 64x2 granularity.
      math_DEINTERLEAVE3_128(u0, u1, u2, 3, p0, p1, p2);
      return;
   }

if (laneSzBlg2 == 1) {
      // 16x8
      // u2 == C7 C6 C5 C4 C3 C2 C1 C0
      // u1 == B7 B6 B5 B4 B3 B2 B1 B0
      // u0 == A7 A6 A5 A4 A3 A2 A1 A0
      //
      // i2 == C7 B7 A7 C6 B6 A6 C5 B5
      // i1 == A5 C4 B4 A4 C4 B3 A3 C2
      // i0 == B2 A2 C1 B1 A1 C0 B0 A0
      //
      // p2 == C7 C6 B7 B6 A7 A6 C5 C4
      // p1 == B5 B4 A5 A4 C3 C2 B3 B2
      // p0 == A3 A2 C1 C0 B1 B0 A1 A0

IRTemp s0, s1, s2, s3, t0, t1, t2, t3, p0, p1, p2, c00111111;
      s0 = s1 = s2 = s3
         = t0 = t1 = t2 = t3 = p0 = p1 = p2 = c00111111 = IRTemp_INVALID;
      newTempsV128_4(&s0, &s1, &s2, &s3);
      newTempsV128_4(&t0, &t1, &t2, &t3);
      newTempsV128_4(&p0, &p1, &p2, &c00111111);

// s0 == b2a2 c1b1a1 c0b0a0
      // s1 == b4a4 c3b3c3 c2b2a2
      // s2 == b6a6 c5b5a5 c4b4a4
      // s3 == b0a0 c7b7a7 c6b6a6
      assign(s0, EX(i0));
      assign(s1, SL(EX(i1),EX(i0),6*2));
      assign(s2, SL(EX(i2),EX(i1),4*2));
      assign(s3, SL(EX(i0),EX(i2),2*2));

// t0 == 0 0 c1c0 b1b0 a1a0
      // t1 == 0 0 c3c2 b3b2 a3a2
      // t2 == 0 0 c5c4 b5b4 a5a4
      // t3 == 0 0 c7c6 b7b6 a7a6
      assign(c00111111, mkV128(0x0FFF));
      assign(t0, AND( ILO16x8( ROR(EX(s0),3*2), EX(s0)), EX(c00111111)));
      assign(t1, AND( ILO16x8( ROR(EX(s1),3*2), EX(s1)), EX(c00111111)));
      assign(t2, AND( ILO16x8( ROR(EX(s2),3*2), EX(s2)), EX(c00111111)));
      assign(t3, AND( ILO16x8( ROR(EX(s3),3*2), EX(s3)), EX(c00111111)));

assign(p0, OR2(EX(t0),          SHL(EX(t1),6*2)));
      assign(p1, OR2(SHL(EX(t2),4*2), SHR(EX(t1),2*2)));
      assign(p2, OR2(SHL(EX(t3),2*2), SHR(EX(t2),4*2)));

// Then deinterleave at 32x4 granularity.
      math_DEINTERLEAVE3_128(u0, u1, u2, 2, p0, p1, p2);
      return;
   }

if (laneSzBlg2 == 0) {
      // 8x16.  This is the same scheme as for 16x8, with twice the
      // number of intermediate values.
      //
      // u2 == C{f..0}
      // u1 == B{f..0}
      // u0 == A{f..0}
      //
      // i2 == CBA{f} CBA{e} CBA{d} CBA{c} CBA{b} C{a}
      // i1 ==  BA{a} CBA{9} CBA{8} CBA{7} CBA{6} CB{5}
      // i0 ==   A{5} CBA{4} CBA{3} CBA{2} CBA{1} CBA{0}
      //
      // p2 == C{fe} B{fe} A{fe} C{dc} B{dc} A{dc} C{ba} B{ba}
      // p1 == A{ba} C{98} B{98} A{98} C{76} B{76} A{76} C{54}
      // p0 == B{54} A{54} C{32} B{32} A{32} C{10} B{10} A{10}
      //
      IRTemp s0, s1, s2, s3, s4, s5, s6, s7,
             t0, t1, t2, t3, t4, t5, t6, t7, p0, p1, p2, cMASK;
      s0 = s1 = s2 = s3 = s4 = s5 = s6 = s7
         = t0 = t1 = t2 = t3 = t4 = t5 = t6 = t7 = p0 = p1 = p2 = cMASK
         = IRTemp_INVALID;
      newTempsV128_4(&s0, &s1, &s2, &s3);
      newTempsV128_4(&s4, &s5, &s6, &s7);
      newTempsV128_4(&t0, &t1, &t2, &t3);
      newTempsV128_4(&t4, &t5, &t6, &t7);
      newTempsV128_4(&p0, &p1, &p2, &cMASK);

// s0 == A{5} CBA{4} CBA{3} CBA{2} CBA{1} CBA{0}
      // s1 == A{7} CBA{6} CBA{5} CBA{4} CBA{3} CBA{2}
      // s2 == A{9} CBA{8} CBA{7} CBA{6} CBA{5} CBA{4}
      // s3 == A{b} CBA{a} CBA{9} CBA{8} CBA{7} CBA{6}
      // s4 == A{d} CBA{c} CBA{b} CBA{a} CBA{9} CBA{8}
      // s5 == A{f} CBA{e} CBA{d} CBA{c} CBA{b} CBA{a}
      // s6 == A{1} CBA{0} CBA{f} CBA{e} CBA{d} CBA{c}
      // s7 == A{3} CBA{2} CBA{1} CBA{0} CBA{f} CBA{e}
      assign(s0, SL(EX(i1),EX(i0), 0));
      assign(s1, SL(EX(i1),EX(i0), 6));
      assign(s2, SL(EX(i1),EX(i0),12));
      assign(s3, SL(EX(i2),EX(i1), 2));
      assign(s4, SL(EX(i2),EX(i1), 8));
      assign(s5, SL(EX(i2),EX(i1),14));
      assign(s6, SL(EX(i0),EX(i2), 4));
      assign(s7, SL(EX(i0),EX(i2),10));

// t0 == 0--(ten)--0 C1 C0 B1 B0 A1 A0
      // t1 == 0--(ten)--0 C3 C2 B3 B2 A3 A2
      // t2 == 0--(ten)--0 C5 C4 B5 B4 A5 A4
      // t3 == 0--(ten)--0 C7 C6 B7 B6 A7 A6
      // t4 == 0--(ten)--0 C9 C8 B9 B8 A9 A8
      // t5 == 0--(ten)--0 Cb Ca Bb Ba Ab Aa
      // t6 == 0--(ten)--0 Cd Cc Bd Bc Ad Ac
      // t7 == 0--(ten)--0 Cf Ce Bf Be Af Ae
      assign(cMASK, mkV128(0x003F));
      assign(t0, AND( ILO8x16( ROR(EX(s0),3), EX(s0)), EX(cMASK)));
      assign(t1, AND( ILO8x16( ROR(EX(s1),3), EX(s1)), EX(cMASK)));
      assign(t2, AND( ILO8x16( ROR(EX(s2),3), EX(s2)), EX(cMASK)));
      assign(t3, AND( ILO8x16( ROR(EX(s3),3), EX(s3)), EX(cMASK)));
      assign(t4, AND( ILO8x16( ROR(EX(s4),3), EX(s4)), EX(cMASK)));
      assign(t5, AND( ILO8x16( ROR(EX(s5),3), EX(s5)), EX(cMASK)));
      assign(t6, AND( ILO8x16( ROR(EX(s6),3), EX(s6)), EX(cMASK)));
      assign(t7, AND( ILO8x16( ROR(EX(s7),3), EX(s7)), EX(cMASK)));

assign(p0, OR3( SHL(EX(t2),12), SHL(EX(t1),6), EX(t0) ));
      assign(p1, OR4( SHL(EX(t5),14), SHL(EX(t4),8),
                 SHL(EX(t3),2), SHR(EX(t2),4) ));
      assign(p2, OR3( SHL(EX(t7),10), SHL(EX(t6),4), SHR(EX(t5),2) ));

// Then deinterleave at 16x8 granularity.
      math_DEINTERLEAVE3_128(u0, u1, u2, 1, p0, p1, p2);
      return;
   }

/*NOTREACHED*/
   vassert(0);
}

/* Do deinterleaving for 4 128 bit vectors, for LD4 insns. */
static
void math_DEINTERLEAVE4_128( 
        /*OUTx4*/ IRTemp* u0, IRTemp* u1, IRTemp* u2, IRTemp* u3,
        UInt laneSzBlg2,
        IRTemp i0, IRTemp i1, IRTemp i2, IRTemp i3 )
{
   if (laneSzBlg2 == 3) {
      // 64x2
      assign(*u0, ILO64x2(EX(i2), EX(i0)));
      assign(*u1, IHI64x2(EX(i2), EX(i0)));
      assign(*u2, ILO64x2(EX(i3), EX(i1)));
      assign(*u3, IHI64x2(EX(i3), EX(i1)));
      return;
   }
   if (laneSzBlg2 == 2) {
      // 32x4
      IRTemp p0 = newTempV128();
      IRTemp p2 = newTempV128();
      IRTemp p1 = newTempV128();
      IRTemp p3 = newTempV128();
      assign(p0, ILO32x4(EX(i1), EX(i0)));
      assign(p1, IHI32x4(EX(i1), EX(i0)));
      assign(p2, ILO32x4(EX(i3), EX(i2)));
      assign(p3, IHI32x4(EX(i3), EX(i2)));
      // And now do what we did for the 64-bit case.
      math_DEINTERLEAVE4_128(u0, u1, u2, u3, 3, p0, p1, p2, p3);
      return;
   }
   if (laneSzBlg2 == 1) {
      // 16x8
      // Deinterleave into 32-bit chunks, then do as the 32-bit case.
      IRTemp p0 = newTempV128();
      IRTemp p1 = newTempV128();
      IRTemp p2 = newTempV128();
      IRTemp p3 = newTempV128();
      assign(p0, IHI16x8(EX(i0), SHL(EX(i0), 8)));
      assign(p1, IHI16x8(EX(i1), SHL(EX(i1), 8)));
      assign(p2, IHI16x8(EX(i2), SHL(EX(i2), 8)));
      assign(p3, IHI16x8(EX(i3), SHL(EX(i3), 8)));
      // From here on is like the 32 bit case.
      math_DEINTERLEAVE4_128(u0, u1, u2, u3, 2, p0, p1, p2, p3);
      return;
   }
   if (laneSzBlg2 == 0) {
      // 8x16
      // Deinterleave into 16-bit chunks, then do as the 16-bit case.
      IRTemp p0 = newTempV128();
      IRTemp p1 = newTempV128();
      IRTemp p2 = newTempV128();
      IRTemp p3 = newTempV128();
      assign(p0, IHI64x2( IHI8x16(EX(i0),ROL(EX(i0),4)),
                          ILO8x16(EX(i0),ROL(EX(i0),4)) ));
      assign(p1, IHI64x2( IHI8x16(EX(i1),ROL(EX(i1),4)),
                          ILO8x16(EX(i1),ROL(EX(i1),4)) ));
      assign(p2, IHI64x2( IHI8x16(EX(i2),ROL(EX(i2),4)),
                          ILO8x16(EX(i2),ROL(EX(i2),4)) ));
      assign(p3, IHI64x2( IHI8x16(EX(i3),ROL(EX(i3),4)),
                          ILO8x16(EX(i3),ROL(EX(i3),4)) ));
      // From here on is like the 16 bit case.
      math_DEINTERLEAVE4_128(u0, u1, u2, u3, 1, p0, p1, p2, p3);
      return;
   }
   /*NOTREACHED*/
   vassert(0);
}

/* Wrappers that use the full-width (de)interleavers to do half-width
   (de)interleaving.  The scheme is to clone each input lane in the
   lower half of each incoming value, do a full width (de)interleave
   at the next lane size up, and remove every other lane of the the
   result.  The returned values may have any old junk in the upper
   64 bits -- the caller must ignore that. */

/* Helper function -- get doubling and narrowing operations. */
static
void math_get_doubler_and_halver ( /*OUT*/IROp* doubler,
                                   /*OUT*/IROp* halver,
                                   UInt laneSzBlg2 )
{
   switch (laneSzBlg2) {
      case 2:
         *doubler = Iop_InterleaveLO32x4; *halver = Iop_CatEvenLanes32x4;
         break;
      case 1:
         *doubler = Iop_InterleaveLO16x8; *halver = Iop_CatEvenLanes16x8;
         break;
      case 0:
         *doubler = Iop_InterleaveLO8x16; *halver = Iop_CatEvenLanes8x16;
         break;
      default:
         vassert(0);
   }
}

/* Do interleaving for 1 64 bit vector, for ST1 insns. */
static
void math_INTERLEAVE1_64( /*OUTx1*/ IRTemp* i0,
                          UInt laneSzBlg2, IRTemp u0 )
{
   assign(*i0, mkexpr(u0));
}

/* Do interleaving for 2 64 bit vectors, for ST2 insns. */
static
void math_INTERLEAVE2_64( /*OUTx2*/ IRTemp* i0, IRTemp* i1,
                          UInt laneSzBlg2, IRTemp u0, IRTemp u1 )
{
   if (laneSzBlg2 == 3) {
      // 1x64, degenerate case
      assign(*i0, EX(u0));
      assign(*i1, EX(u1));
      return;
   }