2010-07-15 Sandra Loosemore Backport from mainline: 2010-06-09 Sandra Loosemore gcc/ * tree-ssa-loop-ivopts.c (adjust_setup_cost): New function. (get_computation_cost_at): Use it. (determine_use_iv_cost_condition): Likewise. (determine_iv_cost): Likewise. 2010-07-05 Sandra Loosemore PR middle-end/42505 gcc/ * tree-ssa-loop-ivopts.c (determine_set_costs): Delete obsolete comments about cost model. (try_add_cand_for): Add second strategy for choosing initial set based on original IVs, controlled by ORIGINALP argument. (get_initial_solution): Add ORIGINALP argument. (find_optimal_iv_set_1): New function, split from find_optimal_iv_set. (find_optimal_iv_set): Try two different strategies for choosing the IV set, and return the one with lower cost. gcc/testsuite/ * gcc.target/arm/pr42505.c: New test case. 2010-07-10 Sandra Loosemore PR middle-end/42505 gcc/ * tree-inline.c (estimate_num_insns): Refactor builtin complexity lookup code into.... * builtins.c (is_simple_builtin, is_inexpensive_builtin): ...these new functions. * tree.h (is_simple_builtin, is_inexpensive_builtin): Declare. * cfgloopanal.c (target_clobbered_regs): Define. (init_set_costs): Initialize target_clobbered_regs. (estimate_reg_pressure_cost): Add call_p argument. When true, adjust the number of available registers to exclude the call-clobbered registers. * cfgloop.h (target_clobbered_regs): Declare. (estimate_reg_pressure_cost): Adjust declaration. * tree-ssa-loop-ivopts.c (struct ivopts_data): Add body_includes_call. (ivopts_global_cost_for_size): Pass it to estimate_reg_pressure_cost. (determine_set_costs): Dump target_clobbered_regs. (loop_body_includes_call): New function. (tree_ssa_iv_optimize_loop): Use it to initialize new field. * loop-invariant.c (gain_for_invariant): Adjust arguments to pass call_p flag through. (best_gain_for_invariant): Likewise. (find_invariants_to_move): Likewise. (move_single_loop_invariants): Likewise, using already-computed has_call field. 2010-07-15 Jie Zhang Issue #8497, #8893 === modified file 'gcc/builtins.c' --- old/gcc/builtins.c 2010-04-13 12:47:11 +0000 +++ new/gcc/builtins.c 2010-08-02 13:51:23 +0000 @@ -13624,3 +13624,123 @@ break; } } + +/* Return true if DECL is a builtin that expands to a constant or similarly + simple code. */ +bool +is_simple_builtin (tree decl) +{ + if (decl && DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL) + switch (DECL_FUNCTION_CODE (decl)) + { + /* Builtins that expand to constants. */ + case BUILT_IN_CONSTANT_P: + case BUILT_IN_EXPECT: + case BUILT_IN_OBJECT_SIZE: + case BUILT_IN_UNREACHABLE: + /* Simple register moves or loads from stack. */ + case BUILT_IN_RETURN_ADDRESS: + case BUILT_IN_EXTRACT_RETURN_ADDR: + case BUILT_IN_FROB_RETURN_ADDR: + case BUILT_IN_RETURN: + case BUILT_IN_AGGREGATE_INCOMING_ADDRESS: + case BUILT_IN_FRAME_ADDRESS: + case BUILT_IN_VA_END: + case BUILT_IN_STACK_SAVE: + case BUILT_IN_STACK_RESTORE: + /* Exception state returns or moves registers around. */ + case BUILT_IN_EH_FILTER: + case BUILT_IN_EH_POINTER: + case BUILT_IN_EH_COPY_VALUES: + return true; + + default: + return false; + } + + return false; +} + +/* Return true if DECL is a builtin that is not expensive, i.e., they are + most probably expanded inline into reasonably simple code. This is a + superset of is_simple_builtin. */ +bool +is_inexpensive_builtin (tree decl) +{ + if (!decl) + return false; + else if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_MD) + return true; + else if (DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL) + switch (DECL_FUNCTION_CODE (decl)) + { + case BUILT_IN_ABS: + case BUILT_IN_ALLOCA: + case BUILT_IN_BSWAP32: + case BUILT_IN_BSWAP64: + case BUILT_IN_CLZ: + case BUILT_IN_CLZIMAX: + case BUILT_IN_CLZL: + case BUILT_IN_CLZLL: + case BUILT_IN_CTZ: + case BUILT_IN_CTZIMAX: + case BUILT_IN_CTZL: + case BUILT_IN_CTZLL: + case BUILT_IN_FFS: + case BUILT_IN_FFSIMAX: + case BUILT_IN_FFSL: + case BUILT_IN_FFSLL: + case BUILT_IN_IMAXABS: + case BUILT_IN_FINITE: + case BUILT_IN_FINITEF: + case BUILT_IN_FINITEL: + case BUILT_IN_FINITED32: + case BUILT_IN_FINITED64: + case BUILT_IN_FINITED128: + case BUILT_IN_FPCLASSIFY: + case BUILT_IN_ISFINITE: + case BUILT_IN_ISINF_SIGN: + case BUILT_IN_ISINF: + case BUILT_IN_ISINFF: + case BUILT_IN_ISINFL: + case BUILT_IN_ISINFD32: + case BUILT_IN_ISINFD64: + case BUILT_IN_ISINFD128: + case BUILT_IN_ISNAN: + case BUILT_IN_ISNANF: + case BUILT_IN_ISNANL: + case BUILT_IN_ISNAND32: + case BUILT_IN_ISNAND64: + case BUILT_IN_ISNAND128: + case BUILT_IN_ISNORMAL: + case BUILT_IN_ISGREATER: + case BUILT_IN_ISGREATEREQUAL: + case BUILT_IN_ISLESS: + case BUILT_IN_ISLESSEQUAL: + case BUILT_IN_ISLESSGREATER: + case BUILT_IN_ISUNORDERED: + case BUILT_IN_VA_ARG_PACK: + case BUILT_IN_VA_ARG_PACK_LEN: + case BUILT_IN_VA_COPY: + case BUILT_IN_TRAP: + case BUILT_IN_SAVEREGS: + case BUILT_IN_POPCOUNTL: + case BUILT_IN_POPCOUNTLL: + case BUILT_IN_POPCOUNTIMAX: + case BUILT_IN_POPCOUNT: + case BUILT_IN_PARITYL: + case BUILT_IN_PARITYLL: + case BUILT_IN_PARITYIMAX: + case BUILT_IN_PARITY: + case BUILT_IN_LABS: + case BUILT_IN_LLABS: + case BUILT_IN_PREFETCH: + return true; + + default: + return is_simple_builtin (decl); + } + + return false; +} + === modified file 'gcc/cfgloop.h' --- old/gcc/cfgloop.h 2009-11-25 10:55:54 +0000 +++ new/gcc/cfgloop.h 2010-08-02 13:51:23 +0000 @@ -622,13 +622,14 @@ /* The properties of the target. */ extern unsigned target_avail_regs; +extern unsigned target_clobbered_regs; extern unsigned target_res_regs; extern unsigned target_reg_cost [2]; extern unsigned target_spill_cost [2]; /* Register pressure estimation for induction variable optimizations & loop invariant motion. */ -extern unsigned estimate_reg_pressure_cost (unsigned, unsigned, bool); +extern unsigned estimate_reg_pressure_cost (unsigned, unsigned, bool, bool); extern void init_set_costs (void); /* Loop optimizer initialization. */ === modified file 'gcc/cfgloopanal.c' --- old/gcc/cfgloopanal.c 2009-09-30 08:57:56 +0000 +++ new/gcc/cfgloopanal.c 2010-08-02 13:51:23 +0000 @@ -320,6 +320,8 @@ /* The properties of the target. */ unsigned target_avail_regs; /* Number of available registers. */ +unsigned target_clobbered_regs; /* Number of available registers that are + call-clobbered. */ unsigned target_res_regs; /* Number of registers reserved for temporary expressions. */ unsigned target_reg_cost[2]; /* The cost for register when there still @@ -342,10 +344,15 @@ unsigned i; target_avail_regs = 0; + target_clobbered_regs = 0; for (i = 0; i < FIRST_PSEUDO_REGISTER; i++) if (TEST_HARD_REG_BIT (reg_class_contents[GENERAL_REGS], i) && !fixed_regs[i]) - target_avail_regs++; + { + target_avail_regs++; + if (call_used_regs[i]) + target_clobbered_regs++; + } target_res_regs = 3; @@ -379,20 +386,29 @@ /* Estimates cost of increased register pressure caused by making N_NEW new registers live around the loop. N_OLD is the number of registers live - around the loop. */ + around the loop. If CALL_P is true, also take into account that + call-used registers may be clobbered in the loop body, reducing the + number of available registers before we spill. */ unsigned -estimate_reg_pressure_cost (unsigned n_new, unsigned n_old, bool speed) +estimate_reg_pressure_cost (unsigned n_new, unsigned n_old, bool speed, + bool call_p) { unsigned cost; unsigned regs_needed = n_new + n_old; + unsigned available_regs = target_avail_regs; + + /* If there is a call in the loop body, the call-clobbered registers + are not available for loop invariants. */ + if (call_p) + available_regs = available_regs - target_clobbered_regs; /* If we have enough registers, we should use them and not restrict the transformations unnecessarily. */ - if (regs_needed + target_res_regs <= target_avail_regs) + if (regs_needed + target_res_regs <= available_regs) return 0; - if (regs_needed <= target_avail_regs) + if (regs_needed <= available_regs) /* If we are close to running out of registers, try to preserve them. */ cost = target_reg_cost [speed] * n_new; === modified file 'gcc/loop-invariant.c' --- old/gcc/loop-invariant.c 2010-04-02 18:54:46 +0000 +++ new/gcc/loop-invariant.c 2010-08-02 13:51:23 +0000 @@ -1173,11 +1173,13 @@ /* Calculates gain for eliminating invariant INV. REGS_USED is the number of registers used in the loop, NEW_REGS is the number of new variables already added due to the invariant motion. The number of registers needed - for it is stored in *REGS_NEEDED. */ + for it is stored in *REGS_NEEDED. SPEED and CALL_P are flags passed + through to estimate_reg_pressure_cost. */ static int gain_for_invariant (struct invariant *inv, unsigned *regs_needed, - unsigned *new_regs, unsigned regs_used, bool speed) + unsigned *new_regs, unsigned regs_used, + bool speed, bool call_p) { int comp_cost, size_cost; @@ -1188,9 +1190,9 @@ if (! flag_ira_loop_pressure) { size_cost = (estimate_reg_pressure_cost (new_regs[0] + regs_needed[0], - regs_used, speed) + regs_used, speed, call_p) - estimate_reg_pressure_cost (new_regs[0], - regs_used, speed)); + regs_used, speed, call_p)); } else { @@ -1245,7 +1247,8 @@ static int best_gain_for_invariant (struct invariant **best, unsigned *regs_needed, - unsigned *new_regs, unsigned regs_used, bool speed) + unsigned *new_regs, unsigned regs_used, + bool speed, bool call_p) { struct invariant *inv; int i, gain = 0, again; @@ -1261,7 +1264,7 @@ continue; again = gain_for_invariant (inv, aregs_needed, new_regs, regs_used, - speed); + speed, call_p); if (again > gain) { gain = again; @@ -1314,7 +1317,7 @@ /* Determines which invariants to move. */ static void -find_invariants_to_move (bool speed) +find_invariants_to_move (bool speed, bool call_p) { int gain; unsigned i, regs_used, regs_needed[N_REG_CLASSES], new_regs[N_REG_CLASSES]; @@ -1353,7 +1356,8 @@ new_regs[ira_reg_class_cover[i]] = 0; } while ((gain = best_gain_for_invariant (&inv, regs_needed, - new_regs, regs_used, speed)) > 0) + new_regs, regs_used, + speed, call_p)) > 0) { set_move_mark (inv->invno, gain); if (! flag_ira_loop_pressure) @@ -1554,7 +1558,8 @@ init_inv_motion_data (); find_invariants (loop); - find_invariants_to_move (optimize_loop_for_speed_p (loop)); + find_invariants_to_move (optimize_loop_for_speed_p (loop), + LOOP_DATA (loop)->has_call); move_invariants (loop); free_inv_motion_data (); === added file 'gcc/testsuite/gcc.target/arm/pr42505.c' --- old/gcc/testsuite/gcc.target/arm/pr42505.c 1970-01-01 00:00:00 +0000 +++ new/gcc/testsuite/gcc.target/arm/pr42505.c 2010-08-02 13:51:23 +0000 @@ -0,0 +1,23 @@ +/* { dg-options "-mthumb -Os -march=armv5te" } */ +/* { dg-require-effective-target arm_thumb1_ok } */ +/* { dg-final { scan-assembler-not "str\[\\t \]*r.,\[\\t \]*.sp," } } */ + +struct A { + int f1; + int f2; +}; + +int func(int c); + +/* This function should not need to spill anything to the stack. */ +int test(struct A* src, struct A* dst, int count) +{ + while (count--) { + if (!func(src->f2)) { + return 0; + } + *dst++ = *src++; + } + + return 1; +} === modified file 'gcc/tree-inline.c' --- old/gcc/tree-inline.c 2010-03-18 20:07:13 +0000 +++ new/gcc/tree-inline.c 2010-08-02 13:51:23 +0000 @@ -3246,34 +3246,13 @@ if (POINTER_TYPE_P (funtype)) funtype = TREE_TYPE (funtype); - if (decl && DECL_BUILT_IN_CLASS (decl) == BUILT_IN_MD) + if (is_simple_builtin (decl)) + return 0; + else if (is_inexpensive_builtin (decl)) cost = weights->target_builtin_call_cost; else cost = weights->call_cost; - if (decl && DECL_BUILT_IN_CLASS (decl) == BUILT_IN_NORMAL) - switch (DECL_FUNCTION_CODE (decl)) - { - case BUILT_IN_CONSTANT_P: - return 0; - case BUILT_IN_EXPECT: - return 0; - - /* Prefetch instruction is not expensive. */ - case BUILT_IN_PREFETCH: - cost = weights->target_builtin_call_cost; - break; - - /* Exception state returns or moves registers around. */ - case BUILT_IN_EH_FILTER: - case BUILT_IN_EH_POINTER: - case BUILT_IN_EH_COPY_VALUES: - return 0; - - default: - break; - } - if (decl) funtype = TREE_TYPE (decl); === modified file 'gcc/tree-ssa-loop-ivopts.c' --- old/gcc/tree-ssa-loop-ivopts.c 2010-04-01 15:18:07 +0000 +++ new/gcc/tree-ssa-loop-ivopts.c 2010-08-02 13:51:23 +0000 @@ -257,6 +257,9 @@ /* Are we optimizing for speed? */ bool speed; + + /* Whether the loop body includes any function calls. */ + bool body_includes_call; }; /* An assignment of iv candidates to uses. */ @@ -2926,6 +2929,20 @@ return get_computation_at (loop, use, cand, use->stmt); } +/* Adjust the cost COST for being in loop setup rather than loop body. + If we're optimizing for space, the loop setup overhead is constant; + if we're optimizing for speed, amortize it over the per-iteration cost. */ +static unsigned +adjust_setup_cost (struct ivopts_data *data, unsigned cost) +{ + if (cost == INFTY) + return cost; + else if (optimize_loop_for_speed_p (data->current_loop)) + return cost / AVG_LOOP_NITER (data->current_loop); + else + return cost; +} + /* Returns cost of addition in MODE. */ static unsigned @@ -3838,8 +3855,8 @@ /* Symbol + offset should be compile-time computable so consider that they are added once to the variable, if present. */ if (var_present && (symbol_present || offset)) - cost.cost += add_cost (TYPE_MODE (ctype), speed) - / AVG_LOOP_NITER (data->current_loop); + cost.cost += adjust_setup_cost (data, + add_cost (TYPE_MODE (ctype), speed)); /* Having offset does not affect runtime cost in case it is added to symbol, but it increases complexity. */ @@ -4104,7 +4121,7 @@ elim_cost = force_var_cost (data, bound, &depends_on_elim); /* The bound is a loop invariant, so it will be only computed once. */ - elim_cost.cost /= AVG_LOOP_NITER (data->current_loop); + elim_cost.cost = adjust_setup_cost (data, elim_cost.cost); } else elim_cost = infinite_cost; @@ -4351,7 +4368,7 @@ cost_base = force_var_cost (data, base, NULL); cost_step = add_cost (TYPE_MODE (TREE_TYPE (base)), data->speed); - cost = cost_step + cost_base.cost / AVG_LOOP_NITER (current_loop); + cost = cost_step + adjust_setup_cost (data, cost_base.cost); /* Prefer the original ivs unless we may gain something by replacing it. The reason is to make debugging simpler; so this is not relevant for @@ -4404,7 +4421,8 @@ { /* We add size to the cost, so that we prefer eliminating ivs if possible. */ - return size + estimate_reg_pressure_cost (size, data->regs_used, data->speed); + return size + estimate_reg_pressure_cost (size, data->regs_used, data->speed, + data->body_includes_call); } /* For each size of the induction variable set determine the penalty. */ @@ -4419,30 +4437,11 @@ struct loop *loop = data->current_loop; bitmap_iterator bi; - /* We use the following model (definitely improvable, especially the - cost function -- TODO): - - We estimate the number of registers available (using MD data), name it A. - - We estimate the number of registers used by the loop, name it U. This - number is obtained as the number of loop phi nodes (not counting virtual - registers and bivs) + the number of variables from outside of the loop. - - We set a reserve R (free regs that are used for temporary computations, - etc.). For now the reserve is a constant 3. - - Let I be the number of induction variables. - - -- if U + I + R <= A, the cost is I * SMALL_COST (just not to encourage - make a lot of ivs without a reason). - -- if A - R < U + I <= A, the cost is I * PRES_COST - -- if U + I > A, the cost is I * PRES_COST and - number of uses * SPILL_COST * (U + I - A) / (U + I) is added. */ - if (dump_file && (dump_flags & TDF_DETAILS)) { fprintf (dump_file, "Global costs:\n"); fprintf (dump_file, " target_avail_regs %d\n", target_avail_regs); + fprintf (dump_file, " target_clobbered_regs %d\n", target_clobbered_regs); fprintf (dump_file, " target_reg_cost %d\n", target_reg_cost[data->speed]); fprintf (dump_file, " target_spill_cost %d\n", target_spill_cost[data->speed]); } @@ -5062,11 +5061,13 @@ } /* Tries to extend the sets IVS in the best possible way in order - to express the USE. */ + to express the USE. If ORIGINALP is true, prefer candidates from + the original set of IVs, otherwise favor important candidates not + based on any memory object. */ static bool try_add_cand_for (struct ivopts_data *data, struct iv_ca *ivs, - struct iv_use *use) + struct iv_use *use, bool originalp) { comp_cost best_cost, act_cost; unsigned i; @@ -5085,7 +5086,8 @@ iv_ca_set_no_cp (data, ivs, use); } - /* First try important candidates not based on any memory object. Only if + /* If ORIGINALP is true, try to find the original IV for the use. Otherwise + first try important candidates not based on any memory object. Only if this fails, try the specific ones. Rationale -- in loops with many variables the best choice often is to use just one generic biv. If we added here many ivs specific to the uses, the optimization algorithm later @@ -5097,7 +5099,10 @@ { cand = iv_cand (data, i); - if (cand->iv->base_object != NULL_TREE) + if (originalp && cand->pos !=IP_ORIGINAL) + continue; + + if (!originalp && cand->iv->base_object != NULL_TREE) continue; if (iv_ca_cand_used_p (ivs, cand)) @@ -5133,8 +5138,13 @@ continue; /* Already tried this. */ - if (cand->important && cand->iv->base_object == NULL_TREE) - continue; + if (cand->important) + { + if (originalp && cand->pos == IP_ORIGINAL) + continue; + if (!originalp && cand->iv->base_object == NULL_TREE) + continue; + } if (iv_ca_cand_used_p (ivs, cand)) continue; @@ -5168,13 +5178,13 @@ /* Finds an initial assignment of candidates to uses. */ static struct iv_ca * -get_initial_solution (struct ivopts_data *data) +get_initial_solution (struct ivopts_data *data, bool originalp) { struct iv_ca *ivs = iv_ca_new (data); unsigned i; for (i = 0; i < n_iv_uses (data); i++) - if (!try_add_cand_for (data, ivs, iv_use (data, i))) + if (!try_add_cand_for (data, ivs, iv_use (data, i), originalp)) { iv_ca_free (&ivs); return NULL; @@ -5246,14 +5256,12 @@ solution and remove the unused ivs while this improves the cost. */ static struct iv_ca * -find_optimal_iv_set (struct ivopts_data *data) +find_optimal_iv_set_1 (struct ivopts_data *data, bool originalp) { - unsigned i; struct iv_ca *set; - struct iv_use *use; /* Get the initial solution. */ - set = get_initial_solution (data); + set = get_initial_solution (data, originalp); if (!set) { if (dump_file && (dump_flags & TDF_DETAILS)) @@ -5276,11 +5284,46 @@ } } + return set; +} + +static struct iv_ca * +find_optimal_iv_set (struct ivopts_data *data) +{ + unsigned i; + struct iv_ca *set, *origset; + struct iv_use *use; + comp_cost cost, origcost; + + /* Determine the cost based on a strategy that starts with original IVs, + and try again using a strategy that prefers candidates not based + on any IVs. */ + origset = find_optimal_iv_set_1 (data, true); + set = find_optimal_iv_set_1 (data, false); + + if (!origset && !set) + return NULL; + + origcost = origset ? iv_ca_cost (origset) : infinite_cost; + cost = set ? iv_ca_cost (set) : infinite_cost; + if (dump_file && (dump_flags & TDF_DETAILS)) { - comp_cost cost = iv_ca_cost (set); - fprintf (dump_file, "Final cost %d (complexity %d)\n\n", cost.cost, cost.complexity); - } + fprintf (dump_file, "Original cost %d (complexity %d)\n\n", + origcost.cost, origcost.complexity); + fprintf (dump_file, "Final cost %d (complexity %d)\n\n", + cost.cost, cost.complexity); + } + + /* Choose the one with the best cost. */ + if (compare_costs (origcost, cost) <= 0) + { + if (set) + iv_ca_free (&set); + set = origset; + } + else if (origset) + iv_ca_free (&origset); for (i = 0; i < n_iv_uses (data); i++) { @@ -5768,6 +5811,25 @@ VEC_free (iv_cand_p, heap, data->iv_candidates); } +/* Returns true if the loop body BODY includes any function calls. */ + +static bool +loop_body_includes_call (basic_block *body, unsigned num_nodes) +{ + gimple_stmt_iterator gsi; + unsigned i; + + for (i = 0; i < num_nodes; i++) + for (gsi = gsi_start_bb (body[i]); !gsi_end_p (gsi); gsi_next (&gsi)) + { + gimple stmt = gsi_stmt (gsi); + if (is_gimple_call (stmt) + && !is_inexpensive_builtin (gimple_call_fndecl (stmt))) + return true; + } + return false; +} + /* Optimizes the LOOP. Returns true if anything changed. */ static bool @@ -5799,6 +5861,7 @@ } body = get_loop_body (loop); + data->body_includes_call = loop_body_includes_call (body, loop->num_nodes); renumber_gimple_stmt_uids_in_blocks (body, loop->num_nodes); free (body); === modified file 'gcc/tree.h' --- old/gcc/tree.h 2010-04-02 18:54:46 +0000 +++ new/gcc/tree.h 2010-08-02 13:51:23 +0000 @@ -4962,6 +4962,8 @@ extern bool merge_ranges (int *, tree *, tree *, int, tree, tree, int, tree, tree); extern void set_builtin_user_assembler_name (tree decl, const char *asmspec); +extern bool is_simple_builtin (tree); +extern bool is_inexpensive_builtin (tree); /* In convert.c */ extern tree strip_float_extensions (tree);