	Vladimir Prus
	Julian Brown

	gcc/
	* config/arm/arm.c (arm_override_options): Warn if mlow-irq-latency
	is specified in Thumb mode.
	(load_multiple_sequence): Return 0 if low irq latency is requested.
	(store_multiple_sequence): Likewise.
	(arm_gen_load_multiple): Load registers one-by-one if low irq
	latency is requested.
	(arm_gen_store_multiple): Likewise.
	(vfp_output_fldmd): When low_irq_latency is non-zero, pop each
	register separately.
	(vfp_emit_fstmd): When low_irq_latency is non-zero, save each
	register separately.
	(arm_get_vfp_saved_size): Adjust saved register size calculation
	for the above changes.
	(print_pop_reg_by_ldr): New.
	(arm_output_epilogue): Use print_pop_reg_by_ldr when low irq
	latency is requested.
	(emit_multi_reg_push): Push registers separately if low irq
	latency is requested.
	* config/arm/arm.h (TARGET_CPU_CPP_BUILTINS): Set
	__low_irq_latency__.
	(low_irq_latency): Define.
	(USE_RETURN_INSN): Don't use return insn when low irq latency
	is requested.
	* config/arm/lib1funcs.asm (do_pop, do_push): Define as variadic
	macros.  When __low_irq_latency__ is defined, push and pop
	registers individually.
	(div0): Use correct punctuation.
	* config/arm/ieee754-df.S: Adjust syntax of using do_push.
	* config/arm/ieee754-sf.S: Likewise.
	* config/arm/bpabi.S: Likewise.
	* config/arm/arm.opt (mlow-irq-latency): New option.
	* config/arm/predicates.md (load_multiple_operation): Return false
	if low irq latency is requested.
	(store_multiple_operation): Likewise.
	* config/arm/arm.md (movmemqi): Don't use it if low irq latency
	is requested.
	* doc/invoke.texi (-mlow-irq-latency): Add documentation.

2010-07-26  Julian Brown

	Merge from Sourcery G++ 4.4:

	2007-06-06  Joseph Myers

	gcc/

=== modified file 'gcc/config/arm/arm.c'
--- old/gcc/config/arm/arm.c	2010-08-05 14:33:31 +0000
+++ new/gcc/config/arm/arm.c	2010-08-05 15:20:54 +0000
@@ -1884,6 +1884,13 @@
 
   /* Register global variables with the garbage collector.  */
   arm_add_gc_roots ();
+
+  if (low_irq_latency && TARGET_THUMB)
+    {
+      warning (0,
+	       "-mlow-irq-latency has no effect when compiling for Thumb");
+      low_irq_latency = 0;
+    }
 }
 
 static void
@@ -9053,6 +9060,9 @@
   int base_reg = -1;
   int i;
 
+  if (low_irq_latency)
+    return 0;
+
   /* Can only handle 2, 3, or 4 insns at present,
      though could be easily extended if required.  */
   gcc_assert (nops >= 2 && nops <= 4);
@@ -9282,6 +9292,9 @@
   int base_reg = -1;
   int i;
 
+  if (low_irq_latency)
+    return 0;
+
   /* Can only handle 2, 3, or 4 insns at present,
      though could be easily extended if required.  */
   gcc_assert (nops >= 2 && nops <= 4);
@@ -9489,7 +9502,7 @@
      As a compromise, we use ldr for counts of 1 or 2 regs, and ldm
      for counts of 3 or 4 regs.  */
-  if (arm_tune_xscale && count <= 2 && ! optimize_size)
+  if (low_irq_latency || (arm_tune_xscale && count <= 2 && ! optimize_size))
     {
       rtx seq;
 
@@ -9552,7 +9565,7 @@
 
   /* See arm_gen_load_multiple for discussion of
      the pros/cons of ldm/stm usage for XScale.  */
-  if (arm_tune_xscale && count <= 2 && ! optimize_size)
+  if (low_irq_latency || (arm_tune_xscale && count <= 2 && ! optimize_size))
     {
       rtx seq;
 
@@ -11795,6 +11808,21 @@
 vfp_output_fldmd (FILE * stream, unsigned int base, int reg, int count)
 {
   int i;
+  int offset;
+
+  if (low_irq_latency)
+    {
+      /* Output a sequence of FLDD instructions.  */
+      offset = 0;
+      for (i = reg; i < reg + count; ++i, offset += 8)
+	{
+	  fputc ('\t', stream);
+	  asm_fprintf (stream, "fldd\td%d, [%r,#%d]\n", i, base, offset);
+	}
+      asm_fprintf (stream, "\tadd\tsp, sp, #%d\n", count * 8);
+      return;
+    }
+
   /* Workaround ARM10 VFPr1 bug.  */
   if (count == 2 && !arm_arch6)
@@ -11865,6 +11893,56 @@
   rtx tmp, reg;
   int i;
 
+  if (low_irq_latency)
+    {
+      int saved_size;
+      rtx sp_insn;
+
+      if (!count)
+	return 0;
+
+      saved_size = count * GET_MODE_SIZE (DFmode);
+
+      /* Since fstd does not have a post-decrement addressing mode, we
+	 first decrement the stack pointer and then use base+offset
+	 stores for the VFP registers.  The ARM EABI unwind information
+	 can't easily describe base+offset loads, so we attach a note
+	 for the effects of the whole block to the first insn, and
+	 avoid marking the subsequent instructions with
+	 RTX_FRAME_RELATED_P.  */
+      sp_insn = gen_addsi3 (stack_pointer_rtx, stack_pointer_rtx,
+			    GEN_INT (-saved_size));
+      sp_insn = emit_insn (sp_insn);
+      RTX_FRAME_RELATED_P (sp_insn) = 1;
+
+      dwarf = gen_rtx_SEQUENCE (VOIDmode, rtvec_alloc (count + 1));
+      XVECEXP (dwarf, 0, 0)
+	= gen_rtx_SET (VOIDmode, stack_pointer_rtx,
+		       plus_constant (stack_pointer_rtx, -saved_size));
+
+      /* Push the double VFP registers to the stack.  */
+      for (i = 0; i < count; ++i)
+	{
+	  rtx reg;
+	  rtx mem;
+	  rtx addr;
+	  rtx insn;
+	  reg = gen_rtx_REG (DFmode, base_reg + 2*i);
+	  addr = (i == 0) ? stack_pointer_rtx
+			  : gen_rtx_PLUS (SImode, stack_pointer_rtx,
					  GEN_INT (i * GET_MODE_SIZE (DFmode)));
+	  mem = gen_frame_mem (DFmode, addr);
+	  insn = emit_move_insn (mem, reg);
+	  XVECEXP (dwarf, 0, i+1) = gen_rtx_SET (VOIDmode, mem, reg);
+	}
+
+      REG_NOTES (sp_insn) = gen_rtx_EXPR_LIST (REG_FRAME_RELATED_EXPR, dwarf,
+					       REG_NOTES (sp_insn));
+
+      return saved_size;
+    }
+
   /* Workaround ARM10 VFPr1 bug.  Data corruption can occur when exactly
      two register pairs are stored by a store multiple insn.  We avoid
     this by pushing an extra pair.  */
@@ -13307,7 +13385,7 @@
       if (count > 0)
 	{
 	  /* Workaround ARM10 VFPr1 bug.  */
-	  if (count == 2 && !arm_arch6)
+	  if (count == 2 && !arm_arch6 && !low_irq_latency)
 	    count++;
 	  saved += count * 8;
 	}
@@ -13645,6 +13723,41 @@
 }
 
+/* Generate to STREAM a sequence of ldr instructions that pops the
+   registers in REGS_MASK from the stack.  SP is incremented as a
+   result.  */
+static void
+print_pop_reg_by_ldr (FILE *stream, int regs_mask, int rfe)
+{
+  int reg;
+
+  gcc_assert (! (regs_mask & (1 << SP_REGNUM)));
+
+  for (reg = 0; reg < PC_REGNUM; ++reg)
+    if (regs_mask & (1 << reg))
+      asm_fprintf (stream, "\tldr\t%r, [%r], #4\n",
+		   reg, SP_REGNUM);
+
+  if (regs_mask & (1 << PC_REGNUM))
+    {
+      if (rfe)
+	/* When returning from an exception, we need to copy SPSR
+	   to CPSR.  There are two ways to do that: the ldm
+	   instruction with the "^" suffix, and the movs
+	   instruction.  The latter would require that we load from
+	   the stack into some scratch register, and then move to
+	   PC.  Therefore, we'd need an extra instruction and would
+	   have to make sure we actually have a spare register.
+	   Using ldm with a single register is simpler.  */
+	asm_fprintf (stream, "\tldm\tsp!, {pc}^\n");
+      else
+	asm_fprintf (stream, "\tldr\t%r, [%r], #4\n",
+		     PC_REGNUM, SP_REGNUM);
+    }
+}
+
 const char *
 arm_output_epilogue (rtx sibling)
 {
@@ -14018,22 +14131,19 @@
 	 to load use the LDR instruction - it is faster.  For Thumb-2
 	 always use pop and the assembler will pick the best instruction.  */
       if (TARGET_ARM && saved_regs_mask == (1 << LR_REGNUM)
-	  && !IS_INTERRUPT(func_type))
+	  && !IS_INTERRUPT (func_type))
 	{
 	  asm_fprintf (f, "\tldr\t%r, [%r], #4\n", LR_REGNUM, SP_REGNUM);
 	}
       else if (saved_regs_mask)
 	{
-	  if (saved_regs_mask & (1 << SP_REGNUM))
-	    /* Note - write back to the stack register is not enabled
-	       (i.e. "ldmfd sp!...").  We know that the stack pointer is
-	       in the list of registers and if we add writeback the
-	       instruction becomes UNPREDICTABLE.  */
-	    print_multi_reg (f, "ldmfd\t%r, ", SP_REGNUM, saved_regs_mask,
-			     rfe);
-	  else if (TARGET_ARM)
-	    print_multi_reg (f, "ldmfd\t%r!, ", SP_REGNUM, saved_regs_mask,
-			     rfe);
+	  gcc_assert (! (saved_regs_mask & (1 << SP_REGNUM)));
+	  if (TARGET_ARM)
+	    if (low_irq_latency)
+	      print_pop_reg_by_ldr (f, saved_regs_mask, rfe);
+	    else
+	      print_multi_reg (f, "ldmfd\t%r!, ", SP_REGNUM, saved_regs_mask,
+			       rfe);
 	  else
 	    print_multi_reg (f, "pop\t", SP_REGNUM, saved_regs_mask, 0);
 	}
@@ -14154,6 +14264,32 @@
 
   gcc_assert (num_regs && num_regs <= 16);
 
+  if (low_irq_latency)
+    {
+      rtx insn = 0;
+
+      /* Emit a series of str instructions rather than a single stm.  */
+      /* TODO: Use strd where possible.  */
+      gcc_assert (! (mask & (1 << SP_REGNUM)));
+
+      for (i = LAST_ARM_REGNUM; i >= 0; --i)
+	{
+	  if (mask & (1 << i))
+	    {
+	      rtx reg, where, mem;
+
+	      reg = gen_rtx_REG (SImode, i);
+	      where = gen_rtx_PRE_DEC (SImode, stack_pointer_rtx);
+	      mem = gen_rtx_MEM (SImode, where);
+	      insn = emit_move_insn (mem, reg);
+	      RTX_FRAME_RELATED_P (insn) = 1;
+	    }
+	}
+
+      return insn;
+    }
+
   /* We don't record the PC in the dwarf frame information.  */
   num_dwarf_regs = num_regs;
   if (mask & (1 << PC_REGNUM))

=== modified file 'gcc/config/arm/arm.h'
--- old/gcc/config/arm/arm.h	2010-08-05 14:33:31 +0000
+++ new/gcc/config/arm/arm.h	2010-08-05 15:20:54 +0000
@@ -101,6 +101,8 @@
 	  builtin_define ("__ARM_PCS");		\
 	builtin_define ("__ARM_EABI__");	\
       }						\
+    if (low_irq_latency)			\
+      builtin_define ("__low_irq_latency__");	\
   } while (0)
 
 /* The various ARM cores.  */
@@ -449,6 +451,10 @@
 /* Nonzero if chip supports integer division instruction.  */
 extern int arm_arch_hwdiv;
 
+/* Nonzero if we should minimize interrupt latency of the
+   generated code.  */
+extern int low_irq_latency;
+
 #ifndef TARGET_DEFAULT
 #define TARGET_DEFAULT  (MASK_APCS_FRAME)
 #endif
 
@@ -1823,9 +1829,10 @@
 /* Determine if the epilogue should be output as RTL.
    You should override this if you define FUNCTION_EXTRA_EPILOGUE.  */
 /* This is disabled for Thumb-2 because it will confuse the
-   conditional insn counter.  */
+   conditional insn counter.
+   Do not use a return insn if we're avoiding ldm/stm instructions.  */
 #define USE_RETURN_INSN(ISCOND)				\
-  (TARGET_ARM ? use_return_insn (ISCOND, NULL) : 0)
+  ((TARGET_ARM && !low_irq_latency) ? use_return_insn (ISCOND, NULL) : 0)
 
 /* Definitions for register eliminations.

=== modified file 'gcc/config/arm/arm.md'
--- old/gcc/config/arm/arm.md	2010-08-05 12:06:40 +0000
+++ new/gcc/config/arm/arm.md	2010-08-05 15:20:54 +0000
@@ -6587,7 +6587,7 @@
 	   (match_operand:BLK 1 "general_operand" "")
 	   (match_operand:SI 2 "const_int_operand" "")
 	   (match_operand:SI 3 "const_int_operand" "")]
-  "TARGET_EITHER"
+  "TARGET_EITHER && !low_irq_latency"
   "
   if (TARGET_32BIT)
     {

=== modified file 'gcc/config/arm/arm.opt'
--- old/gcc/config/arm/arm.opt	2009-06-18 11:24:10 +0000
+++ new/gcc/config/arm/arm.opt	2010-08-05 15:20:54 +0000
@@ -161,6 +161,10 @@
 Target Report Mask(NEON_VECTORIZE_QUAD)
 Use Neon quad-word (rather than double-word) registers for vectorization
 
+mlow-irq-latency
+Target Report Var(low_irq_latency)
+Try to reduce interrupt latency of the generated code
+
 mword-relocations
 Target Report Var(target_word_relocations) Init(TARGET_DEFAULT_WORD_RELOCATIONS)
 Only generate absolute relocations on word sized values.
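[Editorial illustration, not part of the patch: the effect of
-mlow-irq-latency on a function epilogue.  This is a hand-written sketch;
the register set is hypothetical and actual output depends on the function
being compiled.]

	@ Default epilogue: a single load-multiple.  On cores that do not
	@ interrupt a partially completed ldm, worst-case IRQ latency
	@ grows with the number of registers in the list.
	ldmfd	sp!, {r4, r5, r6, pc}

	@ With -mlow-irq-latency (output of print_pop_reg_by_ldr above):
	@ one word per instruction, interruptible between the loads.
	ldr	r4, [sp], #4
	ldr	r5, [sp], #4
	ldr	r6, [sp], #4
	ldr	pc, [sp], #4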
=== modified file 'gcc/config/arm/bpabi.S'
--- old/gcc/config/arm/bpabi.S	2009-12-17 15:37:23 +0000
+++ new/gcc/config/arm/bpabi.S	2010-08-05 15:20:54 +0000
@@ -116,16 +116,17 @@
 	test_div_by_zero	signed
 
 	sub sp, sp, #8
-#if defined(__thumb2__)
+/* Low latency and Thumb-2 do_push implementations can't push sp directly.  */
+#if defined(__thumb2__) || defined(__low_irq_latency__)
 	mov ip, sp
-	push {ip, lr}
+	do_push (ip, lr)
 #else
-	do_push {sp, lr}
+	stmfd sp!, {sp, lr}
 #endif
 	bl SYM(__gnu_ldivmod_helper) __PLT__
 	ldr lr, [sp, #4]
 	add sp, sp, #8
-	do_pop {r2, r3}
+	do_pop (r2, r3)
 	RET
 
#endif /* L_aeabi_ldivmod */
@@ -136,16 +137,17 @@
 	test_div_by_zero	unsigned
 
 	sub sp, sp, #8
-#if defined(__thumb2__)
+/* Low latency and Thumb-2 do_push implementations can't push sp directly.  */
+#if defined(__thumb2__) || defined(__low_irq_latency__)
 	mov ip, sp
-	push {ip, lr}
+	do_push (ip, lr)
 #else
-	do_push {sp, lr}
+	stmfd sp!, {sp, lr}
 #endif
 	bl SYM(__gnu_uldivmod_helper) __PLT__
 	ldr lr, [sp, #4]
 	add sp, sp, #8
-	do_pop {r2, r3}
+	do_pop (r2, r3)
 	RET
 
#endif /* L_aeabi_divmod */

=== modified file 'gcc/config/arm/ieee754-df.S'
--- old/gcc/config/arm/ieee754-df.S	2009-06-05 12:52:36 +0000
+++ new/gcc/config/arm/ieee754-df.S	2010-08-05 15:20:54 +0000
@@ -83,7 +83,7 @@
 ARM_FUNC_START adddf3
 ARM_FUNC_ALIAS aeabi_dadd adddf3
 
-1:	do_push	{r4, r5, lr}
+1:	do_push	(r4, r5, lr)
 
 	@ Look for zeroes, equal values, INF, or NAN.
 	shift1	lsl, r4, xh, #1
@@ -427,7 +427,7 @@
 	do_it	eq, t
 	moveq	r1, #0
 	RETc(eq)
-	do_push	{r4, r5, lr}
+	do_push	(r4, r5, lr)
 	mov	r4, #0x400		@ initial exponent
 	add	r4, r4, #(52-1 - 1)
 	mov	r5, #0			@ sign bit is 0
@@ -447,7 +447,7 @@
 	do_it	eq, t
 	moveq	r1, #0
 	RETc(eq)
-	do_push	{r4, r5, lr}
+	do_push	(r4, r5, lr)
 	mov	r4, #0x400		@ initial exponent
 	add	r4, r4, #(52-1 - 1)
 	ands	r5, r0, #0x80000000	@ sign bit in r5
@@ -481,7 +481,7 @@
 	RETc(eq)			@ we are done already.
 
 	@ value was denormalized.  We can normalize it now.
-	do_push	{r4, r5, lr}
+	do_push	(r4, r5, lr)
 	mov	r4, #0x380		@ setup corresponding exponent
 	and	r5, xh, #0x80000000	@ move sign bit in r5
 	bic	xh, xh, #0x80000000
@@ -508,9 +508,9 @@
 	@ compatibility.
 	adr	ip, LSYM(f0_ret)
 	@ Push pc as well so that RETLDM works correctly.
-	do_push	{r4, r5, ip, lr, pc}
+	do_push	(r4, r5, ip, lr, pc)
 #else
-	do_push	{r4, r5, lr}
+	do_push	(r4, r5, lr)
 #endif
 
 	mov	r5, #0
@@ -534,9 +534,9 @@
 	@ compatibility.
 	adr	ip, LSYM(f0_ret)
 	@ Push pc as well so that RETLDM works correctly.
-	do_push	{r4, r5, ip, lr, pc}
+	do_push	(r4, r5, ip, lr, pc)
 #else
-	do_push	{r4, r5, lr}
+	do_push	(r4, r5, lr)
 #endif
 
 	ands	r5, ah, #0x80000000	@ sign bit in r5
@@ -585,7 +585,7 @@
 	@ Legacy code expects the result to be returned in f0.  Copy it
 	@ there as well.
 LSYM(f0_ret):
-	do_push	{r0, r1}
+	do_push	(r0, r1)
 	ldfd	f0, [sp], #8
 	RETLDM
 
@@ -602,7 +602,7 @@
 ARM_FUNC_START muldf3
 ARM_FUNC_ALIAS aeabi_dmul muldf3
 
-	do_push	{r4, r5, r6, lr}
+	do_push	(r4, r5, r6, lr)
 
 	@ Mask out exponents, trap any zero/denormal/INF/NAN.
 	mov	ip, #0xff
@@ -910,7 +910,7 @@
 ARM_FUNC_START divdf3
 ARM_FUNC_ALIAS aeabi_ddiv divdf3
 
-	do_push	{r4, r5, r6, lr}
+	do_push	(r4, r5, r6, lr)
 
 	@ Mask out exponents, trap any zero/denormal/INF/NAN.
 	mov	ip, #0xff
@@ -1195,7 +1195,7 @@
 	@ The status-returning routines are required to preserve all
 	@ registers except ip, lr, and cpsr.
 
-6:	do_push	{r0, lr}
+6:	do_push	(r0, lr)
 	ARM_CALL cmpdf2
 	@ Set the Z flag correctly, and the C flag unconditionally.
 	cmp	r0, #0
 
=== modified file 'gcc/config/arm/ieee754-sf.S'
--- old/gcc/config/arm/ieee754-sf.S	2009-06-05 12:52:36 +0000
+++ new/gcc/config/arm/ieee754-sf.S	2010-08-05 15:20:54 +0000
@@ -481,7 +481,7 @@
 	and	r3, ip, #0x80000000
 
 	@ Well, no way to make it shorter without the umull instruction.
-	do_push	{r3, r4, r5}
+	do_push	(r3, r4, r5)
 	mov	r4, r0, lsr #16
 	mov	r5, r1, lsr #16
 	bic	r0, r0, r4, lsl #16
@@ -492,7 +492,7 @@
 	mla	r0, r4, r1, r0
 	adds	r3, r3, r0, lsl #16
 	adc	r1, ip, r0, lsr #16
-	do_pop	{r0, r4, r5}
+	do_pop	(r0, r4, r5)
 
 #else
 
@@ -882,7 +882,7 @@
 	@ The status-returning routines are required to preserve all
 	@ registers except ip, lr, and cpsr.
 
-6:	do_push	{r0, r1, r2, r3, lr}
+6:	do_push	(r0, r1, r2, r3, lr)
 	ARM_CALL cmpsf2
 	@ Set the Z flag correctly, and the C flag unconditionally.
 	cmp	r0, #0

=== modified file 'gcc/config/arm/lib1funcs.asm'
--- old/gcc/config/arm/lib1funcs.asm	2010-04-02 18:54:46 +0000
+++ new/gcc/config/arm/lib1funcs.asm	2010-08-05 15:20:54 +0000
@@ -254,8 +254,8 @@
 .macro	shift1 op, arg0, arg1, arg2
 	\op	\arg0, \arg1, \arg2
 .endm
-#define do_push	push
-#define do_pop	pop
+#define do_push(...)	push {__VA_ARGS__}
+#define do_pop(...)	pop {__VA_ARGS__}
 #define COND(op1, op2, cond) op1 ## op2 ## cond
 /* Perform an arithmetic operation with a variable shift operand.  This
    requires two instructions and a scratch register on Thumb-2.  */
@@ -269,8 +269,42 @@
 .macro	shift1 op, arg0, arg1, arg2
 	mov	\arg0, \arg1, \op \arg2
 .endm
-#define do_push	stmfd sp!,
-#define do_pop	ldmfd sp!,
+#if defined(__low_irq_latency__)
+/* Push and pop the registers one at a time, since ldm/stm cannot be
+   interrupted mid-sequence on some cores.  The _buildC1/_buildC2
+   macros count the variadic arguments, selecting one of the
+   do_pushN/do_popN expansions below.  */
+#define do_push(...) \
+  _buildN1(do_push, _buildC1(__VA_ARGS__))(__VA_ARGS__)
+#define _buildN1(BASE, X)	_buildN2(BASE, X)
+#define _buildN2(BASE, X)	BASE##X
+#define _buildC1(...)		_buildC2(__VA_ARGS__,9,8,7,6,5,4,3,2,1)
+#define _buildC2(a1,a2,a3,a4,a5,a6,a7,a8,a9,c,...) c
+
+#define do_push1(r1) str r1, [sp, #-4]!
+#define do_push2(r1, r2) str r2, [sp, #-4]! ; str r1, [sp, #-4]!
+#define do_push3(r1, r2, r3) str r3, [sp, #-4]! ; str r2, [sp, #-4]! ; str r1, [sp, #-4]!
+#define do_push4(r1, r2, r3, r4) \
+  do_push3 (r2, r3, r4) ; \
+  do_push1 (r1)
+#define do_push5(r1, r2, r3, r4, r5) \
+  do_push4 (r2, r3, r4, r5) ; \
+  do_push1 (r1)
+
+#define do_pop(...) \
+  _buildN1(do_pop, _buildC1(__VA_ARGS__))(__VA_ARGS__)
+
+#define do_pop1(r1) ldr r1, [sp], #4
+#define do_pop2(r1, r2) ldr r1, [sp], #4 ; ldr r2, [sp], #4
+#define do_pop3(r1, r2, r3) ldr r1, [sp], #4 ; ldr r2, [sp], #4 ; ldr r3, [sp], #4
+#define do_pop4(r1, r2, r3, r4) \
+  do_pop1 (r1) ; \
+  do_pop3 (r2, r3, r4)
+#define do_pop5(r1, r2, r3, r4, r5) \
+  do_pop1 (r1) ; \
+  do_pop4 (r2, r3, r4, r5)
+#else
+#define do_push(...)	stmfd sp!, {__VA_ARGS__}
+#define do_pop(...)	ldmfd sp!, {__VA_ARGS__}
+#endif
+
 #define COND(op1, op2, cond) op1 ## cond ## op2
 .macro	shiftop name, dest, src1, src2, shiftop, shiftreg, tmp
 	\name \dest, \src1, \src2, \shiftop \shiftreg
@@ -1260,7 +1294,7 @@
 ARM_FUNC_START div0
 #endif
 
-	do_push	{r1, lr}
+	do_push	(r1, lr)
 	mov	r0, #SIGFPE
 	bl	SYM(raise) __PLT__
 	RETLDM	r1
@@ -1277,7 +1311,7 @@
 #if defined __ARM_EABI__ && defined __linux__
 @ EABI GNU/Linux call to cacheflush syscall.
 ARM_FUNC_START clear_cache
-	do_push	{r7}
+	do_push	(r7)
 #if __ARM_ARCH__ >= 7 || defined(__ARM_ARCH_6T2__)
 	movw	r7, #2
 	movt	r7, #0xf
@@ -1287,7 +1321,7 @@
 #endif
 	mov	r2, #0
 	swi	0
-	do_pop	{r7}
+	do_pop	(r7)
 	RET
 	FUNC_END clear_cache
 #else
@@ -1490,7 +1524,7 @@
 	push	{r4, lr}
 # else
 ARM_FUNC_START clzdi2
-	do_push	{r4, lr}
+	do_push	(r4, lr)
 # endif
 	cmp	xxh, #0
 	bne	1f

=== modified file 'gcc/config/arm/predicates.md'
--- old/gcc/config/arm/predicates.md	2010-07-30 14:17:05 +0000
+++ new/gcc/config/arm/predicates.md	2010-08-05 15:20:54 +0000
@@ -328,6 +328,9 @@
   HOST_WIDE_INT i = 1, base = 0;
   rtx elt;
 
+  if (low_irq_latency)
+    return false;
+
   if (count <= 1
       || GET_CODE (XVECEXP (op, 0, 0)) != SET)
     return false;
@@ -385,6 +388,9 @@
   HOST_WIDE_INT i = 1, base = 0;
   rtx elt;
 
+  if (low_irq_latency)
+    return false;
+
   if (count <= 1
       || GET_CODE (XVECEXP (op, 0, 0)) != SET)
     return false;

=== modified file 'gcc/doc/invoke.texi'
--- old/gcc/doc/invoke.texi	2010-07-29 17:03:20 +0000
+++ new/gcc/doc/invoke.texi	2010-08-05 15:20:54 +0000
@@ -469,6 +469,7 @@
 -mtpcs-frame  -mtpcs-leaf-frame @gol
 -mcaller-super-interworking  -mcallee-super-interworking @gol
 -mtp=@var{name} @gol
+-mlow-irq-latency @gol
 -mword-relocations @gol
 -mfix-cortex-m3-ldrd}
 
@@ -9489,6 +9490,12 @@
 @code{,}, @code{!}, @code{|}, and @code{*} as needed.
 
+@item -mlow-irq-latency
+@opindex mlow-irq-latency
+Avoid instructions with high interrupt latency when generating
+code.  This can increase code size and reduce performance.
+The option is off by default.
+
 @end table
 
 The conditional text @code{X} in a %@{@code{S}:@code{X}@} or similar
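[Editorial illustration, not part of the patch.  First, a sketch of how the
argument-counting macros in lib1funcs.asm select an expansion; the trace is
hand-derived from the definitions above.]

	do_push (r4, r5, lr)
	-> _buildN1(do_push, _buildC1(r4, r5, lr))(r4, r5, lr)
	-> _buildN1(do_push, 3)(r4, r5, lr)   @ _buildC2 yields the count, 3
	-> do_push3 (r4, r5, lr)
	-> str lr, [sp, #-4]! ; str r5, [sp, #-4]! ; str r4, [sp, #-4]!

[Second, a usage sketch.  The target triplet and optimization level are
hypothetical; the flag and the preprocessor macro are the ones added above.]

	arm-none-eabi-gcc -O2 -mlow-irq-latency -S foo.c

With the flag, ldm/stm (and VFP fldm/fstm) sequences in prologues,
epilogues, and block moves are replaced by single-register loads and
stores, and __low_irq_latency__ is predefined so that the lib1funcs.asm
do_push/do_pop macros expand to individual str/ldr instructions as well.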