Diffstat:
 toolchain-layer/recipes-devtools/gcc/gcc-4.5/linaro/gcc-4.5-linaro-r99495.patch | 784 ++++++++++
 1 file changed, 784 insertions(+), 0 deletions(-)
diff --git a/toolchain-layer/recipes-devtools/gcc/gcc-4.5/linaro/gcc-4.5-linaro-r99495.patch b/toolchain-layer/recipes-devtools/gcc/gcc-4.5/linaro/gcc-4.5-linaro-r99495.patch
new file mode 100644
index 0000000000..bb866ce8d9
--- /dev/null
+++ b/toolchain-layer/recipes-devtools/gcc/gcc-4.5/linaro/gcc-4.5-linaro-r99495.patch
@@ -0,0 +1,784 @@
+2011-03-24 Revital Eres <revital.eres@linaro.org>
+
+ gcc/
+ * loop-doloop.c (doloop_condition_get): Support new form of
+ doloop pattern and use prev_nondebug_insn instead of PREV_INSN.
+ * config/arm/thumb2.md (*thumb2_addsi3_compare0): Remove "*".
+ (doloop_end): New.
+ * config/arm/arm.md (*addsi3_compare0): Remove "*".
+ * ddg.c (check_closing_branch_deps, get_node_of_insn_uid):
+ New functions.
+ (create_ddg): Pass sbitmap containing do-loop related
+ instructions instead of closing_branch_deps parameter and call
+ check_closing_branch_deps function.
+ * ddg.h (create_ddg): Adjust the function declaration.
+ * modulo-sched.c (PS_STAGE_COUNT): Rename to CALC_STAGE_COUNT
+ and redefine.
+ (doloop_register_get): Handle NONDEBUG_INSN_P.
+ (stage_count): New field in struct partial_schedule.
+ (mark_doloop_insns, calculate_stage_count): New functions.
+ (normalize_sched_times): Rename to reset_sched_times and handle
+ incrementing the sched time of the nodes by a constant value
+ passed as parameter.
+ (duplicate_insns_of_cycles): Skip closing branch.
+ (sms_schedule_by_order): Schedule closing branch when
+ closing_branch_deps is true.
+ (ps_insn_find_column): Handle closing branch.
+ (sms_schedule): Call reset_sched_times and handle case where
+ do-loop pattern is not decoupled from the other loop instructions.
+ Support new form of doloop pattern.
+ (ps_insert_empty_row): Update calls to normalize_sched_times
+ and rotate_partial_schedule functions.
+
+=== modified file 'gcc/config/arm/arm.md'
+--- old/gcc/config/arm/arm.md 2011-03-11 14:26:34 +0000
++++ new/gcc/config/arm/arm.md 2011-03-24 07:45:38 +0000
+@@ -734,7 +734,7 @@
+ ""
+ )
+
+-(define_insn "*addsi3_compare0"
++(define_insn "addsi3_compare0"
+ [(set (reg:CC_NOOV CC_REGNUM)
+ (compare:CC_NOOV
+ (plus:SI (match_operand:SI 1 "s_register_operand" "r, r")
+
+=== modified file 'gcc/config/arm/thumb2.md'
+--- old/gcc/config/arm/thumb2.md 2011-02-08 10:51:58 +0000
++++ new/gcc/config/arm/thumb2.md 2011-03-24 07:45:38 +0000
+@@ -1194,7 +1194,7 @@
+ (set_attr "length" "2")]
+ )
+
+-(define_insn "*thumb2_addsi3_compare0"
++(define_insn "thumb2_addsi3_compare0"
+ [(set (reg:CC_NOOV CC_REGNUM)
+ (compare:CC_NOOV
+ (plus:SI (match_operand:SI 1 "s_register_operand" "l, 0, r")
+@@ -1445,3 +1445,56 @@
+ [(set_attr "length" "4,4,16")
+ (set_attr "predicable" "yes")]
+ )
++
++
++;; Define the subtract-one-and-jump insns so loop.c
++;; knows what to generate.
++(define_expand "doloop_end"
++ [(use (match_operand 0 "" "")) ; loop pseudo
++ (use (match_operand 1 "" "")) ; iterations; zero if unknown
++ (use (match_operand 2 "" "")) ; max iterations
++ (use (match_operand 3 "" "")) ; loop level
++ (use (match_operand 4 "" ""))] ; label
++ "TARGET_32BIT"
++ "
++ {
++ /* Currently SMS relies on the do-loop pattern to recognize loops
++ where (1) the control part consists of all insns defining and/or
++ using a certain 'count' register and (2) the loop count can be
++ adjusted by modifying this register prior to the loop.
++ ??? The possible introduction of a new block to initialize the
++ new IV can potentially affect branch optimizations. */
++ if (optimize > 0 && flag_modulo_sched)
++ {
++ rtx s0;
++ rtx bcomp;
++ rtx loc_ref;
++ rtx cc_reg;
++ rtx insn;
++ rtx cmp;
++
++ /* Only use this on innermost loops. */
++ if (INTVAL (operands[3]) > 1)
++ FAIL;
++
++ if (GET_MODE (operands[0]) != SImode)
++ FAIL;
++
++ s0 = operands [0];
++ if (TARGET_THUMB2)
++ insn = emit_insn (gen_thumb2_addsi3_compare0 (s0, s0, GEN_INT (-1)));
++ else
++ insn = emit_insn (gen_addsi3_compare0 (s0, s0, GEN_INT (-1)));
++
++ cmp = XVECEXP (PATTERN (insn), 0, 0);
++ cc_reg = SET_DEST (cmp);
++ bcomp = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
++ loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands [4]);
++ emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx,
++ gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp,
++ loc_ref, pc_rtx)));
++ DONE;
++ }else
++ FAIL;
++ }")
++
+
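The doloop_end expander added above emits gen_addsi3_compare0 (or its Thumb-2 twin) to decrement the loop register while setting the condition codes, then a branch back to the loop label that is taken while the result is non-zero. Below is a plain-C model of the loop shape that sequence implements; it is a sketch for illustration only, and run_doloop_model and the iteration counts are placeholders rather than anything generated by the pattern.

#include <stdio.h>

/* Toy model of the doloop_end sequence: decrement the counter, let the
   "flags" come from the result, and branch while the result is non-zero.
   `iterations' stands in for operand 1 of the expander.  */
static void
run_doloop_model (unsigned int iterations)
{
  unsigned int count = iterations;   /* the loop pseudo (operand 0) */
  unsigned int body_runs = 0;

  do
    {
      body_runs++;                   /* the loop body */
      /* addsi3_compare0: count = count - 1, flags set from (count - 1).  */
    }
  while (--count != 0);              /* branch on NE back to the label */

  printf ("iterations=%u, body ran %u times\n", iterations, body_runs);
}

int
main (void)
{
  run_doloop_model (1);
  run_doloop_model (5);
  return 0;
}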
+=== modified file 'gcc/ddg.c'
+--- old/gcc/ddg.c 2010-07-19 08:58:53 +0000
++++ new/gcc/ddg.c 2011-03-24 07:45:38 +0000
+@@ -60,6 +60,8 @@
+ static ddg_edge_ptr create_ddg_edge (ddg_node_ptr, ddg_node_ptr, dep_type,
+ dep_data_type, int, int);
+ static void add_edge_to_ddg (ddg_ptr g, ddg_edge_ptr);
++static ddg_node_ptr get_node_of_insn_uid (ddg_ptr, int);
++
+
+ /* Auxiliary variable for mem_read_insn_p/mem_write_insn_p. */
+ static bool mem_ref_p;
+@@ -450,12 +452,65 @@
+ sched_free_deps (head, tail, false);
+ }
+
++/* Given DOLOOP_INSNS which holds the instructions that
++ belong to the do-loop part; mark closing_branch_deps field in ddg G
++ as TRUE if the do-loop part's instructions are dependent on the other
++ loop instructions. Otherwise mark it as FALSE. */
++static void
++check_closing_branch_deps (ddg_ptr g, sbitmap doloop_insns)
++{
++ sbitmap_iterator sbi;
++ unsigned int u = 0;
++
++ EXECUTE_IF_SET_IN_SBITMAP (doloop_insns, 0, u, sbi)
++ {
++ ddg_edge_ptr e;
++ ddg_node_ptr u_node = get_node_of_insn_uid (g, u);
++
++ gcc_assert (u_node);
++
++ for (e = u_node->in; e != 0; e = e->next_in)
++ {
++ ddg_node_ptr v_node = e->src;
++
++ if (((unsigned int) INSN_UID (v_node->insn) == u)
++ || DEBUG_INSN_P (v_node->insn))
++ continue;
++
++ /* Ignore dependencies between memory writes and the
++ jump. */
++ if (JUMP_P (u_node->insn)
++ && e->type == OUTPUT_DEP
++ && mem_write_insn_p (v_node->insn))
++ continue;
++ if (!TEST_BIT (doloop_insns, INSN_UID (v_node->insn)))
++ {
++ g->closing_branch_deps = 1;
++ return;
++ }
++ }
++ for (e = u_node->out; e != 0; e = e->next_out)
++ {
++ ddg_node_ptr v_node = e->dest;
++
++ if (((unsigned int) INSN_UID (v_node->insn) == u)
++ || DEBUG_INSN_P (v_node->insn))
++ continue;
++ if (!TEST_BIT (doloop_insns, INSN_UID (v_node->insn)))
++ {
++ g->closing_branch_deps = 1;
++ return;
++ }
++ }
++ }
++ g->closing_branch_deps = 0;
++}
+
+ /* Given a basic block, create its DDG and return a pointer to a variable
+ of ddg type that represents it.
+ Initialize the ddg structure fields to the appropriate values. */
+ ddg_ptr
+-create_ddg (basic_block bb, int closing_branch_deps)
++create_ddg (basic_block bb, sbitmap doloop_insns)
+ {
+ ddg_ptr g;
+ rtx insn, first_note;
+@@ -465,7 +520,6 @@
+ g = (ddg_ptr) xcalloc (1, sizeof (struct ddg));
+
+ g->bb = bb;
+- g->closing_branch_deps = closing_branch_deps;
+
+ /* Count the number of insns in the BB. */
+ for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb));
+@@ -538,6 +592,11 @@
+ /* Build the data dependency graph. */
+ build_intra_loop_deps (g);
+ build_inter_loop_deps (g);
++
++ /* Check whether the do-loop part is decoupled from the other loop
++ instructions. */
++ check_closing_branch_deps (g, doloop_insns);
++
+ return g;
+ }
+
+@@ -831,6 +890,18 @@
+ return NULL;
+ }
+
++/* Given the uid of an instruction UID return the node that represents it. */
++static ddg_node_ptr
++get_node_of_insn_uid (ddg_ptr g, int uid)
++{
++ int i;
++
++ for (i = 0; i < g->num_nodes; i++)
++ if (uid == INSN_UID (g->nodes[i].insn))
++ return &g->nodes[i];
++ return NULL;
++}
++
+ /* Given a set OPS of nodes in the DDG, find the set of their successors
+ which are not in OPS, and set their bits in SUCC. Bits corresponding to
+ OPS are cleared from SUCC. Leaves the other bits in SUCC unchanged. */
+
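check_closing_branch_deps above reduces to a set question: does any dependence edge connect one of the marked do-loop instructions to an instruction outside that set? Below is a self-contained sketch of the same test over a toy edge list; the node numbering, the edge lists and doloop_has_outside_deps are invented for illustration, and the sketch omits the patch's exceptions for debug insns and for output dependences between memory writes and the jump.

#include <stdbool.h>
#include <stdio.h>

struct edge { int src, dest; };

/* Return true if any edge crosses the boundary between the marked
   do-loop nodes and the rest of the graph, i.e. the control part is
   not decoupled from the other loop instructions.  */
static bool
doloop_has_outside_deps (const struct edge *edges, int n_edges,
                         const bool *in_doloop)
{
  for (int i = 0; i < n_edges; i++)
    if (in_doloop[edges[i].src] != in_doloop[edges[i].dest])
      return true;
  return false;
}

int
main (void)
{
  /* Nodes 3 and 4 stand for the decrement and the closing branch.  */
  bool in_doloop[5] = { false, false, false, true, true };

  /* Body deps 0->1->2 and control deps 3->4 only: decoupled.  */
  struct edge decoupled[] = { {0, 1}, {1, 2}, {3, 4} };
  /* Same graph plus a dependence from body insn 2 into the decrement.  */
  struct edge coupled[]   = { {0, 1}, {1, 2}, {3, 4}, {2, 3} };

  printf ("decoupled: closing_branch_deps=%d\n",
          doloop_has_outside_deps (decoupled, 3, in_doloop));
  printf ("coupled:   closing_branch_deps=%d\n",
          doloop_has_outside_deps (coupled, 4, in_doloop));
  return 0;
}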
+=== modified file 'gcc/ddg.h'
+--- old/gcc/ddg.h 2009-11-25 10:55:54 +0000
++++ new/gcc/ddg.h 2011-03-24 07:45:38 +0000
+@@ -167,7 +167,7 @@
+ };
+
+
+-ddg_ptr create_ddg (basic_block, int closing_branch_deps);
++ddg_ptr create_ddg (basic_block, sbitmap);
+ void free_ddg (ddg_ptr);
+
+ void print_ddg (FILE *, ddg_ptr);
+
+=== modified file 'gcc/loop-doloop.c'
+--- old/gcc/loop-doloop.c 2010-07-19 08:58:53 +0000
++++ new/gcc/loop-doloop.c 2011-03-24 07:45:38 +0000
+@@ -78,6 +78,8 @@
+ rtx inc_src;
+ rtx condition;
+ rtx pattern;
++ rtx cc_reg = NULL_RTX;
++ rtx reg_orig = NULL_RTX;
+
+ /* The canonical doloop pattern we expect has one of the following
+ forms:
+@@ -96,7 +98,16 @@
+ 2) (set (reg) (plus (reg) (const_int -1))
+ (set (pc) (if_then_else (reg != 0)
+ (label_ref (label))
+- (pc))). */
++ (pc))).
++
++ Some targets (ARM) do the comparison before the branch, as in the
++ following form:
++
++ 3) (parallel [(set (cc) (compare ((plus (reg) (const_int -1), 0)))
++ (set (reg) (plus (reg) (const_int -1)))])
++ (set (pc) (if_then_else (cc == NE)
++ (label_ref (label))
++ (pc))) */
+
+ pattern = PATTERN (doloop_pat);
+
+@@ -104,19 +115,47 @@
+ {
+ rtx cond;
+ rtx prev_insn = prev_nondebug_insn (doloop_pat);
++ rtx cmp_arg1, cmp_arg2;
++ rtx cmp_orig;
+
+- /* We expect the decrement to immediately precede the branch. */
++ /* In case the pattern is not PARALLEL we expect two forms
++ of doloop which are cases 2) and 3) above: in case 2) the
++ decrement immediately precedes the branch, while in case 3)
++ the compare and decrement instructions immediately precede
++ the branch. */
+
+ if (prev_insn == NULL_RTX || !INSN_P (prev_insn))
+ return 0;
+
+ cmp = pattern;
+- inc = PATTERN (PREV_INSN (doloop_pat));
++ if (GET_CODE (PATTERN (prev_insn)) == PARALLEL)
++ {
++ /* The third case: the compare and decrement instructions
++ immediately precede the branch. */
++ cmp_orig = XVECEXP (PATTERN (prev_insn), 0, 0);
++ if (GET_CODE (cmp_orig) != SET)
++ return 0;
++ if (GET_CODE (SET_SRC (cmp_orig)) != COMPARE)
++ return 0;
++ cmp_arg1 = XEXP (SET_SRC (cmp_orig), 0);
++ cmp_arg2 = XEXP (SET_SRC (cmp_orig), 1);
++ if (cmp_arg2 != const0_rtx
++ || GET_CODE (cmp_arg1) != PLUS)
++ return 0;
++ reg_orig = XEXP (cmp_arg1, 0);
++ if (XEXP (cmp_arg1, 1) != GEN_INT (-1)
++ || !REG_P (reg_orig))
++ return 0;
++ cc_reg = SET_DEST (cmp_orig);
++
++ inc = XVECEXP (PATTERN (prev_insn), 0, 1);
++ }
++ else
++ inc = PATTERN (prev_insn);
+ /* We expect the condition to be of the form (reg != 0) */
+ cond = XEXP (SET_SRC (cmp), 0);
+ if (GET_CODE (cond) != NE || XEXP (cond, 1) != const0_rtx)
+ return 0;
+-
+ }
+ else
+ {
+@@ -162,11 +201,15 @@
+ return 0;
+
+ if ((XEXP (condition, 0) == reg)
++ /* For the third case: */
++ || ((cc_reg != NULL_RTX)
++ && (XEXP (condition, 0) == cc_reg)
++ && (reg_orig == reg))
+ || (GET_CODE (XEXP (condition, 0)) == PLUS
+- && XEXP (XEXP (condition, 0), 0) == reg))
++ && XEXP (XEXP (condition, 0), 0) == reg))
+ {
+ if (GET_CODE (pattern) != PARALLEL)
+- /* The second form we expect:
++ /* For the second form we expect:
+
+ (set (reg) (plus (reg) (const_int -1))
+ (set (pc) (if_then_else (reg != 0)
+@@ -181,7 +224,24 @@
+ (set (reg) (plus (reg) (const_int -1)))
+ (additional clobbers and uses)])
+
+- So we return that form instead.
++ For the third form we expect:
++
++ (parallel [(set (cc) (compare ((plus (reg) (const_int -1)), 0))
++ (set (reg) (plus (reg) (const_int -1)))])
++ (set (pc) (if_then_else (cc == NE)
++ (label_ref (label))
++ (pc)))
++
++ which is equivalent to the following:
++
++ (parallel [(set (cc) (compare (reg, 1))
++ (set (reg) (plus (reg) (const_int -1)))
++ (set (pc) (if_then_else (NE == cc)
++ (label_ref (label))
++ (pc))))])
++
++ So we return the second form instead for the two cases.
++
+ */
+ condition = gen_rtx_fmt_ee (NE, VOIDmode, inc_src, const1_rtx);
+
+
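The comment added above relies on one fact: comparing the decremented counter with zero is the same as comparing the original counter with one, which is why the ARM compare-and-decrement form can be folded back into the canonical (reg != 1) shape. Below is a throwaway check of that equivalence over a few 32-bit values; it is an illustration only, not GCC code.

#include <assert.h>
#include <stdio.h>

int
main (void)
{
  /* For any 32-bit value r, "(r - 1) != 0" after the decrement holds
     exactly when "r != 1" before it (both sides wrap modulo 2^32).  */
  unsigned int samples[] = { 0u, 1u, 2u, 100u, 0xffffffffu };

  for (unsigned int i = 0; i < sizeof samples / sizeof samples[0]; i++)
    {
      unsigned int r = samples[i];
      assert (((r - 1u) != 0u) == (r != 1u));
    }
  printf ("equivalence holds on all samples\n");
  return 0;
}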
+=== modified file 'gcc/modulo-sched.c'
+--- old/gcc/modulo-sched.c 2009-11-25 10:55:54 +0000
++++ new/gcc/modulo-sched.c 2011-03-24 07:45:38 +0000
+@@ -116,8 +116,10 @@
+
+ /* The number of different iterations the nodes in ps span, assuming
+ the stage boundaries are placed efficiently. */
+-#define PS_STAGE_COUNT(ps) ((PS_MAX_CYCLE (ps) - PS_MIN_CYCLE (ps) \
+- + 1 + (ps)->ii - 1) / (ps)->ii)
++#define CALC_STAGE_COUNT(min_cycle,max_cycle,ii) ((max_cycle - min_cycle \
++ + 1 + ii - 1) / ii)
++/* The stage count of ps. */
++#define PS_STAGE_COUNT(ps) (((partial_schedule_ptr)(ps))->stage_count)
+
+ /* A single instruction in the partial schedule. */
+ struct ps_insn
+@@ -155,6 +157,8 @@
+ int max_cycle;
+
+ ddg_ptr g; /* The DDG of the insns in the partial schedule. */
++
++ int stage_count; /* The stage count of the partial schedule. */
+ };
+
+ /* We use this to record all the register replacements we do in
+@@ -195,6 +199,7 @@
+ rtx, rtx);
+ static void duplicate_insns_of_cycles (partial_schedule_ptr,
+ int, int, int, rtx);
++static int calculate_stage_count (partial_schedule_ptr ps);
+
+ #define SCHED_ASAP(x) (((node_sched_params_ptr)(x)->aux.info)->asap)
+ #define SCHED_TIME(x) (((node_sched_params_ptr)(x)->aux.info)->time)
+@@ -310,10 +315,10 @@
+ either a single (parallel) branch-on-count or a (non-parallel)
+ branch immediately preceded by a single (decrement) insn. */
+ first_insn_not_to_check = (GET_CODE (PATTERN (tail)) == PARALLEL ? tail
+- : PREV_INSN (tail));
++ : prev_nondebug_insn (tail));
+
+ for (insn = head; insn != first_insn_not_to_check; insn = NEXT_INSN (insn))
+- if (reg_mentioned_p (reg, insn))
++ if (reg_mentioned_p (reg, insn) && NONDEBUG_INSN_P (insn))
+ {
+ if (dump_file)
+ {
+@@ -332,6 +337,24 @@
+ #endif
+ }
+
++/* Mark in DOLOOP_INSNS the instructions that belong to the do-loop part.
++ Use TAIL to recognize that part. */
++static void
++mark_doloop_insns (sbitmap doloop_insns, rtx tail)
++{
++ rtx first_insn_not_to_check, insn;
++
++ /* This is the first instruction which belongs to the doloop part. */
++ first_insn_not_to_check = (GET_CODE (PATTERN (tail)) == PARALLEL ? tail
++ : prev_nondebug_insn (tail));
++
++ sbitmap_zero (doloop_insns);
++ for (insn = first_insn_not_to_check; insn != NEXT_INSN (tail);
++ insn = NEXT_INSN (insn))
++ if (NONDEBUG_INSN_P (insn))
++ SET_BIT (doloop_insns, INSN_UID (insn));
++}
++
+ /* Check if COUNT_REG is set to a constant in the PRE_HEADER block, so
+ that the number of iterations is a compile-time constant. If so,
+ return the rtx that sets COUNT_REG to a constant, and set COUNT to
+@@ -569,13 +592,12 @@
+ }
+ }
+
+-/* Bump the SCHED_TIMEs of all nodes to start from zero. Set the values
+- of SCHED_ROW and SCHED_STAGE. */
++/* Bump the SCHED_TIMEs of all nodes by AMOUNT. Set the values of
++ SCHED_ROW and SCHED_STAGE. */
+ static void
+-normalize_sched_times (partial_schedule_ptr ps)
++reset_sched_times (partial_schedule_ptr ps, int amount)
+ {
+ int row;
+- int amount = PS_MIN_CYCLE (ps);
+ int ii = ps->ii;
+ ps_insn_ptr crr_insn;
+
+@@ -584,6 +606,10 @@
+ {
+ ddg_node_ptr u = crr_insn->node;
+ int normalized_time = SCHED_TIME (u) - amount;
++ int new_min_cycle = PS_MIN_CYCLE (ps) - amount;
++ /* The first cycle in row zero after the rotation. */
++ int new_first_cycle_in_row_zero =
++ new_min_cycle + ii - SMODULO (new_min_cycle, ii);
+
+ if (dump_file)
+ fprintf (dump_file, "crr_insn->node=%d, crr_insn->cycle=%d,\
+@@ -592,8 +618,30 @@
+ gcc_assert (SCHED_TIME (u) >= ps->min_cycle);
+ gcc_assert (SCHED_TIME (u) <= ps->max_cycle);
+ SCHED_TIME (u) = normalized_time;
+- SCHED_ROW (u) = normalized_time % ii;
+- SCHED_STAGE (u) = normalized_time / ii;
++ crr_insn->cycle = normalized_time;
++ SCHED_ROW (u) = SMODULO (normalized_time, ii);
++
++ /* If min_cycle is in row zero after the rotation then
++ the stage count can be calculated by dividing the cycle
++ with ii. Otherwise, the calculation is done by dividing the
++ SMSed kernel into two intervals:
++
++ 1) min_cycle <= interval 0 < first_cycle_in_row_zero
++ 2) first_cycle_in_row_zero <= interval 1 < max_cycle
++
++ Cycles in interval 0 are in stage 0. The stage of cycles
++ in interval 1 should be added by 1 to take interval 0 into
++ account. */
++ if (SMODULO (new_min_cycle, ii) == 0)
++ SCHED_STAGE (u) = normalized_time / ii;
++ else
++ {
++ if (crr_insn->cycle < new_first_cycle_in_row_zero)
++ SCHED_STAGE (u) = 0;
++ else
++ SCHED_STAGE (u) =
++ ((SCHED_TIME (u) - new_first_cycle_in_row_zero) / ii) + 1;
++ }
+ }
+ }
+
+@@ -646,9 +694,12 @@
+
+ /* Do not duplicate any insn which refers to count_reg as it
+ belongs to the control part.
++ If closing_branch_deps is true the closing branch is scheduled
++ as well and thus should be ignored.
+ TODO: This should be done by analyzing the control part of
+ the loop. */
+- if (reg_mentioned_p (count_reg, u_node->insn))
++ if (reg_mentioned_p (count_reg, u_node->insn)
++ || JUMP_P (ps_ij->node->insn))
+ continue;
+
+ if (for_prolog)
+@@ -894,7 +945,8 @@
+ basic_block condition_bb = NULL;
+ edge latch_edge;
+ gcov_type trip_count = 0;
+-
++ sbitmap doloop_insns;
++
+ loop_optimizer_init (LOOPS_HAVE_PREHEADERS
+ | LOOPS_HAVE_RECORDED_EXITS);
+ if (number_of_loops () <= 1)
+@@ -919,6 +971,7 @@
+ setup_sched_infos ();
+ haifa_sched_init ();
+
++ doloop_insns = sbitmap_alloc (get_max_uid () + 1);
+ /* Allocate memory to hold the DDG array one entry for each loop.
+ We use loop->num as index into this array. */
+ g_arr = XCNEWVEC (ddg_ptr, number_of_loops ());
+@@ -1009,9 +1062,11 @@
+ continue;
+ }
+
+- /* Don't handle BBs with calls or barriers, or !single_set insns,
+- or auto-increment insns (to avoid creating invalid reg-moves
+- for the auto-increment insns).
++ /* Don't handle BBs with calls or barriers or auto-increment insns
++ (to avoid creating invalid reg-moves for the auto-increment insns),
++ or !single_set with the exception of instructions that include
++ count_reg---these instructions are part of the control part
++ that do-loop recognizes.
+ ??? Should handle auto-increment insns.
+ ??? Should handle insns defining subregs. */
+ for (insn = head; insn != NEXT_INSN (tail); insn = NEXT_INSN (insn))
+@@ -1021,7 +1076,8 @@
+ if (CALL_P (insn)
+ || BARRIER_P (insn)
+ || (NONDEBUG_INSN_P (insn) && !JUMP_P (insn)
+- && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE)
++ && !single_set (insn) && GET_CODE (PATTERN (insn)) != USE
++ && !reg_mentioned_p (count_reg, insn))
+ || (FIND_REG_INC_NOTE (insn, NULL_RTX) != 0)
+ || (INSN_P (insn) && (set = single_set (insn))
+ && GET_CODE (SET_DEST (set)) == SUBREG))
+@@ -1048,14 +1104,16 @@
+
+ continue;
+ }
+-
+- if (! (g = create_ddg (bb, 0)))
++ mark_doloop_insns (doloop_insns, tail);
++ if (! (g = create_ddg (bb, doloop_insns)))
+ {
+ if (dump_file)
+ fprintf (dump_file, "SMS create_ddg failed\n");
+ continue;
+ }
+-
++ if (dump_file)
++ fprintf (dump_file, "SMS closing_branch_deps: %d\n",
++ g->closing_branch_deps);
+ g_arr[loop->num] = g;
+ if (dump_file)
+ fprintf (dump_file, "...OK\n");
+@@ -1157,11 +1215,13 @@
+
+ ps = sms_schedule_by_order (g, mii, maxii, node_order);
+
+- if (ps){
+- stage_count = PS_STAGE_COUNT (ps);
+- gcc_assert(stage_count >= 1);
+- }
+-
++ if (ps)
++ {
++ stage_count = calculate_stage_count (ps);
++ gcc_assert(stage_count >= 1);
++ PS_STAGE_COUNT(ps) = stage_count;
++ }
++
+ /* Stage count of 1 means that there is no interleaving between
+ iterations, let the scheduling passes do the job. */
+ if (stage_count <= 1
+@@ -1182,17 +1242,7 @@
+ else
+ {
+ struct undo_replace_buff_elem *reg_move_replaces;
+-
+- if (dump_file)
+- {
+- fprintf (dump_file,
+- "SMS succeeded %d %d (with ii, sc)\n", ps->ii,
+- stage_count);
+- print_partial_schedule (ps, dump_file);
+- fprintf (dump_file,
+- "SMS Branch (%d) will later be scheduled at cycle %d.\n",
+- g->closing_branch->cuid, PS_MIN_CYCLE (ps) - 1);
+- }
++ int amount;
+
+ /* Set the stage boundaries. If the DDG is built with closing_branch_deps,
+ the closing_branch was scheduled and should appear in the last (ii-1)
+@@ -1202,12 +1252,28 @@
+ TODO: Revisit the issue of scheduling the insns of the
+ control part relative to the branch when the control part
+ has more than one insn. */
+- normalize_sched_times (ps);
+- rotate_partial_schedule (ps, PS_MIN_CYCLE (ps));
++ amount = (g->closing_branch_deps)? SCHED_TIME (g->closing_branch) + 1:
++ PS_MIN_CYCLE (ps);
++ reset_sched_times (ps, amount);
++ rotate_partial_schedule (ps, amount);
++
+ set_columns_for_ps (ps);
+
+ canon_loop (loop);
+
++ if (dump_file)
++ {
++ fprintf (dump_file,
++ "SMS succeeded %d %d (with ii, sc)\n", ps->ii,
++ stage_count);
++ print_partial_schedule (ps, dump_file);
++ if (!g->closing_branch_deps)
++ fprintf (dump_file,
++ "SMS Branch (%d) will later be scheduled at \
++ cycle %d.\n",
++ g->closing_branch->cuid, PS_MIN_CYCLE (ps) - 1);
++ }
++
+ /* case the BCT count is not known , Do loop-versioning */
+ if (count_reg && ! count_init)
+ {
+@@ -1252,6 +1318,7 @@
+ }
+
+ free (g_arr);
++ sbitmap_free (doloop_insns);
+
+ /* Release scheduler data, needed until now because of DFA. */
+ haifa_sched_finish ();
+@@ -1759,8 +1826,9 @@
+ RESET_BIT (tobe_scheduled, u);
+ continue;
+ }
+-
+- if (JUMP_P (insn)) /* Closing branch handled later. */
++ /* Closing branch handled later unless closing_branch_deps
++ is true. */
++ if (JUMP_P (insn) && !g->closing_branch_deps)
+ {
+ RESET_BIT (tobe_scheduled, u);
+ continue;
+@@ -1893,8 +1961,8 @@
+ if (dump_file)
+ fprintf (dump_file, "split_row=%d\n", split_row);
+
+- normalize_sched_times (ps);
+- rotate_partial_schedule (ps, ps->min_cycle);
++ reset_sched_times (ps, PS_MIN_CYCLE (ps));
++ rotate_partial_schedule (ps, PS_MIN_CYCLE (ps));
+
+ rows_new = (ps_insn_ptr *) xcalloc (new_ii, sizeof (ps_insn_ptr));
+ for (row = 0; row < split_row; row++)
+@@ -2571,6 +2639,7 @@
+ ps_insn_ptr next_ps_i;
+ ps_insn_ptr first_must_follow = NULL;
+ ps_insn_ptr last_must_precede = NULL;
++ ps_insn_ptr last_in_row = NULL;
+ int row;
+
+ if (! ps_i)
+@@ -2597,8 +2666,37 @@
+ else
+ last_must_precede = next_ps_i;
+ }
++ /* The closing branch must be the last in the row. */
++ if (must_precede
++ && TEST_BIT (must_precede, next_ps_i->node->cuid)
++ && JUMP_P (next_ps_i->node->insn))
++ return false;
++
++ last_in_row = next_ps_i;
+ }
+
++ /* If closing_branch_deps is true we are scheduling the closing
++ branch as well. Make sure there is no dependent instruction after
++ it as the branch should be the last instruction. */
++ if (JUMP_P (ps_i->node->insn))
++ {
++ if (first_must_follow)
++ return false;
++ if (last_in_row)
++ {
++ /* Make the branch the last in the row. New instructions
++ will be inserted at the beginning of the row or after the
++ last must_precede instruction thus the branch is guaranteed
++ to remain the last instruction in the row. */
++ last_in_row->next_in_row = ps_i;
++ ps_i->prev_in_row = last_in_row;
++ ps_i->next_in_row = NULL;
++ }
++ else
++ ps->rows[row] = ps_i;
++ return true;
++ }
++
+ /* Now insert the node after INSERT_AFTER_PSI. */
+
+ if (! last_must_precede)
+@@ -2820,6 +2918,54 @@
+ return ps_i;
+ }
+
++/* Calculate the stage count of the partial schedule PS. */
++int
++calculate_stage_count (partial_schedule_ptr ps)
++{
++ int stage_count;
++
++ /* If closing_branch_deps is false then the stage
++ boundaries are placed efficiently, meaning that min_cycle will be
++ placed at row 0. Otherwise, the closing branch will be placed in
++ row ii-1. For the latter case we assume the final SMSed kernel can
++ be divided into two intervals. This assumption is used for the
++ stage count calculation:
++
++ 1) min_cycle <= interval 0 < first_cycle_in_row_zero
++ 2) first_cycle_in_row_zero <= interval 1 < max_cycle
++ */
++ stage_count =
++ CALC_STAGE_COUNT (PS_MIN_CYCLE (ps), PS_MAX_CYCLE (ps), ps->ii);
++ if (ps->g->closing_branch_deps)
++ {
++ int new_min_cycle;
++ int new_min_cycle_row;
++ int rotation_amount = SCHED_TIME (ps->g->closing_branch) + 1;
++
++ /* This is the new value of min_cycle after the final rotation to
++ bring closing branch into row ii-1. */
++ new_min_cycle = PS_MIN_CYCLE (ps) - rotation_amount;
++ /* This is the row in which the new min_cycle will be placed. */
++ new_min_cycle_row = SMODULO (new_min_cycle, ps->ii);
++ /* If the row of min_cycle is zero then interval 0 is empty.
++ Otherwise, we need to calculate interval 1 and add it by one
++ to take interval 0 into account. */
++ if (new_min_cycle_row != 0)
++ {
++ int new_max_cycle, first_cycle_in_row_zero;
++
++ new_max_cycle = PS_MAX_CYCLE (ps) - rotation_amount;
++ first_cycle_in_row_zero =
++ new_min_cycle + ps->ii - new_min_cycle_row;
++
++ stage_count =
++ CALC_STAGE_COUNT (first_cycle_in_row_zero, new_max_cycle,
++ ps->ii) + 1;
++ }
++ }
++ return stage_count;
++}
++
+ /* Rotate the rows of PS such that insns scheduled at time
+ START_CYCLE will appear in row 0. Updates max/min_cycles. */
+ void
+
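The stage-count logic introduced above boils down to CALC_STAGE_COUNT, i.e. ceil ((max_cycle - min_cycle + 1) / ii); when closing_branch_deps is set, the rotation that puts the closing branch in row ii - 1 may leave the new min_cycle outside row 0, in which case calculate_stage_count counts the stages from the first row-0 cycle up to max_cycle and adds one stage for the cycles before it. Below is a standalone recomputation of both cases; the helpers and the sample numbers (ii = 4, kernel cycles 2..13, branch scheduled at cycle 4) are made up for illustration and are not GCC's.

#include <stdio.h>

/* Mirror of CALC_STAGE_COUNT: ceil ((max - min + 1) / ii).  */
static int
calc_stage_count (int min_cycle, int max_cycle, int ii)
{
  return (max_cycle - min_cycle + 1 + ii - 1) / ii;
}

/* Non-negative remainder, like SMODULO in modulo-sched.c.  */
static int
smodulo (int x, int y)
{
  int r = x % y;
  return r < 0 ? r + y : r;
}

/* Follow the shape of calculate_stage_count from the patch.  */
static int
stage_count (int min_cycle, int max_cycle, int ii,
             int closing_branch_deps, int branch_time)
{
  int sc = calc_stage_count (min_cycle, max_cycle, ii);

  if (closing_branch_deps)
    {
      int rotation = branch_time + 1;       /* branch ends up in row ii-1 */
      int new_min = min_cycle - rotation;
      int new_min_row = smodulo (new_min, ii);

      if (new_min_row != 0)
        {
          int new_max = max_cycle - rotation;
          int first_cycle_in_row_zero = new_min + ii - new_min_row;

          /* Interval 1 divided by ii, plus one stage for interval 0.  */
          sc = calc_stage_count (first_cycle_in_row_zero, new_max, ii) + 1;
        }
    }
  return sc;
}

int
main (void)
{
  /* Decoupled control part: stage boundaries are placed efficiently.  */
  printf ("decoupled:               sc=%d\n", stage_count (2, 13, 4, 0, 0));
  /* Branch kept inside the kernel and scheduled at cycle 4.  */
  printf ("closing_branch_deps set: sc=%d\n", stage_count (2, 13, 4, 1, 4));
  return 0;
}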