2011-12-05 Ramana Radhakrishnan Backport from mainline -A15 tuning. 2011-11-30 Matthew Gretton-Dann * config/arm/arm.c (arm_issue_rate): Cortex-A15 can triple issue. * config/arm/arm.md (mul64): New attribute. (generic_sched): Cortex-A15 is not scheduled generically. (cortex-a15.md): Include. * config/arm/cortex-a15.md: New machine description. * config/arm/t-arm (MD_INCLUDES): Add cortex-a15.md. 2011-11-30 Matthew Gretton-Dann * config/arm/t-arm (MD_INCLUDES): Ensure all md files are listed. === modified file 'gcc/config/arm/arm.c' --- old/gcc/config/arm/arm.c 2011-12-05 10:55:48 +0000 +++ new/gcc/config/arm/arm.c 2011-12-05 12:33:25 +0000 @@ -24056,6 +24056,9 @@ { switch (arm_tune) { + case cortexa15: + return 3; + case cortexr4: case cortexr4f: case cortexr5: === modified file 'gcc/config/arm/arm.md' --- old/gcc/config/arm/arm.md 2011-10-26 11:38:30 +0000 +++ new/gcc/config/arm/arm.md 2011-12-02 00:38:59 +0000 @@ -345,6 +345,13 @@ (const_string "mult") (const_string "alu"))) +; Is this an (integer side) multiply with a 64-bit result? +(define_attr "mul64" "no,yes" + (if_then_else + (eq_attr "insn" "smlalxy,umull,umulls,umlal,umlals,smull,smulls,smlal,smlals") + (const_string "yes") + (const_string "no"))) + ; Load scheduling, set from the arm_ld_sched variable ; initialized by arm_option_override() (define_attr "ldsched" "no,yes" (const (symbol_ref "arm_ld_sched"))) @@ -511,7 +518,7 @@ (define_attr "generic_sched" "yes,no" (const (if_then_else - (ior (eq_attr "tune" "fa526,fa626,fa606te,fa626te,fmp626,fa726te,arm926ejs,arm1020e,arm1026ejs,arm1136js,arm1136jfs,cortexa5,cortexa8,cortexa9,cortexm4") + (ior (eq_attr "tune" "fa526,fa626,fa606te,fa626te,fmp626,fa726te,arm926ejs,arm1020e,arm1026ejs,arm1136js,arm1136jfs,cortexa5,cortexa8,cortexa9,cortexa15,cortexm4") (eq_attr "tune_cortexr4" "yes")) (const_string "no") (const_string "yes")))) @@ -537,6 +544,7 @@ (include "cortex-a5.md") (include "cortex-a8.md") (include "cortex-a9.md") +(include "cortex-a15.md") (include "cortex-r4.md") (include "cortex-r4f.md") (include "cortex-m4.md") === added file 'gcc/config/arm/cortex-a15.md' --- old/gcc/config/arm/cortex-a15.md 1970-01-01 00:00:00 +0000 +++ new/gcc/config/arm/cortex-a15.md 2011-12-02 00:38:59 +0000 @@ -0,0 +1,186 @@ +;; ARM Cortex-A15 pipeline description +;; Copyright (C) 2011 Free Software Foundation, Inc. +;; +;; Written by Matthew Gretton-Dann + +;; This file is part of GCC. +;; +;; GCC is free software; you can redistribute it and/or modify it +;; under the terms of the GNU General Public License as published by +;; the Free Software Foundation; either version 3, or (at your option) +;; any later version. +;; +;; GCC is distributed in the hope that it will be useful, but +;; WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;; General Public License for more details. +;; +;; You should have received a copy of the GNU General Public License +;; along with GCC; see the file COPYING3. If not see +;; . + +(define_automaton "cortex_a15") + +;; The Cortex-A15 core is modelled as a triple issue pipeline that has +;; the following dispatch units. +;; 1. Two pipelines for simple integer operations: SX1, SX2 +;; 2. Two pipelines for Neon and FP data-processing operations: CX1, CX2 +;; 3. One pipeline for branch operations: BX +;; 4. One pipeline for integer multiply and divide operations: MX +;; 5. Two pipelines for load and store operations: LS1, LS2 +;; +;; We can issue into three pipelines per-cycle. +;; +;; We assume that where we have unit pairs xx1 is always filled before xx2. + +;; The three issue units +(define_cpu_unit "ca15_i0, ca15_i1, ca15_i2" "cortex_a15") + +(define_reservation "ca15_issue1" "(ca15_i0|ca15_i1|ca15_i2)") +(define_reservation "ca15_issue2" "((ca15_i0+ca15_i1)|(ca15_i1+ca15_i2))") +(define_reservation "ca15_issue3" "(ca15_i0+ca15_i1+ca15_i2)") +(final_presence_set "ca15_i1" "ca15_i0") +(final_presence_set "ca15_i2" "ca15_i1") + +;; The main dispatch units +(define_cpu_unit "ca15_sx1, ca15_sx2" "cortex_a15") +(define_cpu_unit "ca15_cx1, ca15_cx2" "cortex_a15") +(define_cpu_unit "ca15_ls1, ca15_ls2" "cortex_a15") +(define_cpu_unit "ca15_bx, ca15_mx" "cortex_a15") + +(define_reservation "ca15_ls" "(ca15_ls1|ca15_ls2)") + +;; The extended load-store pipeline +(define_cpu_unit "ca15_ldr, ca15_str" "cortex_a15") + +;; The extended ALU pipeline +(define_cpu_unit "ca15_sx1_alu, ca15_sx1_shf, ca15_sx1_sat" "cortex_a15") +(define_cpu_unit "ca15_sx2_alu, ca15_sx2_shf, ca15_sx2_sat" "cortex_a15") + +;; Simple Execution Unit: +;; +;; Simple ALU without shift +(define_insn_reservation "cortex_a15_alu" 2 + (and (eq_attr "tune" "cortexa15") + (and (eq_attr "type" "alu") + (eq_attr "neon_type" "none"))) + "ca15_issue1,(ca15_sx1,ca15_sx1_alu)|(ca15_sx2,ca15_sx2_alu)") + +;; ALU ops with immediate shift +(define_insn_reservation "cortex_a15_alu_shift" 3 + (and (eq_attr "tune" "cortexa15") + (and (eq_attr "type" "alu_shift") + (eq_attr "neon_type" "none"))) + "ca15_issue1,(ca15_sx1,ca15_sx1+ca15_sx1_shf,ca15_sx1_alu)\ + |(ca15_sx2,ca15_sx2+ca15_sx2_shf,ca15_sx2_alu)") + +;; ALU ops with register controlled shift +(define_insn_reservation "cortex_a15_alu_shift_reg" 3 + (and (eq_attr "tune" "cortexa15") + (and (eq_attr "type" "alu_shift_reg") + (eq_attr "neon_type" "none"))) + "(ca15_issue2,ca15_sx1+ca15_sx2,ca15_sx1_shf,ca15_sx2_alu)\ + |(ca15_issue1,(ca15_issue1+ca15_sx2,ca15_sx1+ca15_sx2_shf)\ + |(ca15_issue1+ca15_sx1,ca15_sx1+ca15_sx1_shf),ca15_sx1_alu)") + +;; Multiply Execution Unit: +;; +;; 32-bit multiplies +(define_insn_reservation "cortex_a15_mult32" 3 + (and (eq_attr "tune" "cortexa15") + (and (eq_attr "type" "mult") + (and (eq_attr "neon_type" "none") + (eq_attr "mul64" "no")))) + "ca15_issue1,ca15_mx") + +;; 64-bit multiplies +(define_insn_reservation "cortex_a15_mult64" 4 + (and (eq_attr "tune" "cortexa15") + (and (eq_attr "type" "mult") + (and (eq_attr "neon_type" "none") + (eq_attr "mul64" "yes")))) + "ca15_issue1,ca15_mx*2") + +;; Integer divide +(define_insn_reservation "cortex_a15_udiv" 9 + (and (eq_attr "tune" "cortexa15") + (eq_attr "insn" "udiv")) + "ca15_issue1,ca15_mx") + +(define_insn_reservation "cortex_a15_sdiv" 10 + (and (eq_attr "tune" "cortexa15") + (eq_attr "insn" "sdiv")) + "ca15_issue1,ca15_mx") + +;; Block all issue pipes for a cycle +(define_insn_reservation "cortex_a15_block" 1 + (and (eq_attr "tune" "cortexa15") + (and (eq_attr "type" "block") + (eq_attr "neon_type" "none"))) + "ca15_issue3") + +;; Branch execution Unit +;; +;; Branches take one issue slot. +;; No latency as there is no result +(define_insn_reservation "cortex_a15_branch" 0 + (and (eq_attr "tune" "cortexa15") + (and (eq_attr "type" "branch") + (eq_attr "neon_type" "none"))) + "ca15_issue1,ca15_bx") + + +;; We lie with calls. They take up all issue slots, and form a block in the +;; pipeline. The result however is available the next cycle. +;; +;; Addition of new units requires this to be updated. +(define_insn_reservation "cortex_a15_call" 1 + (and (eq_attr "tune" "cortexa15") + (and (eq_attr "type" "call") + (eq_attr "neon_type" "none"))) + "ca15_issue3,\ + ca15_sx1+ca15_sx2+ca15_bx+ca15_mx+ca15_cx1+ca15_cx2+ca15_ls1+ca15_ls2,\ + ca15_sx1_alu+ca15_sx1_shf+ca15_sx1_sat+ca15_sx2_alu+ca15_sx2_shf\ + +ca15_sx2_sat+ca15_ldr+ca15_str") + +;; Load-store execution Unit +;; +;; Loads of up to two words. +(define_insn_reservation "cortex_a15_load1" 4 + (and (eq_attr "tune" "cortexa15") + (and (eq_attr "type" "load_byte,load1,load2") + (eq_attr "neon_type" "none"))) + "ca15_issue1,ca15_ls,ca15_ldr,nothing") + +;; Loads of three or four words. +(define_insn_reservation "cortex_a15_load3" 5 + (and (eq_attr "tune" "cortexa15") + (and (eq_attr "type" "load3,load4") + (eq_attr "neon_type" "none"))) + "ca15_issue2,ca15_ls1+ca15_ls2,ca15_ldr,ca15_ldr,nothing") + +;; Stores of up to two words. +(define_insn_reservation "cortex_a15_store1" 0 + (and (eq_attr "tune" "cortexa15") + (and (eq_attr "type" "store1,store2") + (eq_attr "neon_type" "none"))) + "ca15_issue1,ca15_ls,ca15_str") + +;; Stores of three or four words. +(define_insn_reservation "cortex_a15_store3" 0 + (and (eq_attr "tune" "cortexa15") + (and (eq_attr "type" "store3,store4") + (eq_attr "neon_type" "none"))) + "ca15_issue2,ca15_ls1+ca15_ls2,ca15_str,ca15_str") + +;; Simple execution unit bypasses +(define_bypass 1 "cortex_a15_alu" + "cortex_a15_alu,cortex_a15_alu_shift,cortex_a15_alu_shift_reg") +(define_bypass 2 "cortex_a15_alu_shift" + "cortex_a15_alu,cortex_a15_alu_shift,cortex_a15_alu_shift_reg") +(define_bypass 2 "cortex_a15_alu_shift_reg" + "cortex_a15_alu,cortex_a15_alu_shift,cortex_a15_alu_shift_reg") +(define_bypass 1 "cortex_a15_alu" "cortex_a15_load1,cortex_a15_load3") +(define_bypass 2 "cortex_a15_alu_shift" "cortex_a15_load1,cortex_a15_load3") +(define_bypass 2 "cortex_a15_alu_shift_reg" + "cortex_a15_load1,cortex_a15_load3") === modified file 'gcc/config/arm/t-arm' --- old/gcc/config/arm/t-arm 2011-01-03 20:52:22 +0000 +++ new/gcc/config/arm/t-arm 2011-12-02 00:38:59 +0000 @@ -31,6 +31,16 @@ $(srcdir)/config/arm/fmp626.md \ $(srcdir)/config/arm/fa726te.md \ $(srcdir)/config/arm/arm926ejs.md \ + $(srcdir)/config/arm/cortex-a15.md \ + $(srcdir)/config/arm/cortex-a5.md \ + $(srcdir)/config/arm/cortex-a8.md \ + $(srcdir)/config/arm/cortex-a8-neon.md \ + $(srcdir)/config/arm/cortex-a9.md \ + $(srcdir)/config/arm/cortex-a9-neon.md \ + $(srcdir)/config/arm/cortex-m4-fpu.md \ + $(srcdir)/config/arm/cortex-m4.md \ + $(srcdir)/config/arm/cortex-r4f.md \ + $(srcdir)/config/arm/cortex-r4.md \ $(srcdir)/config/arm/cirrus.md \ $(srcdir)/config/arm/fpa.md \ $(srcdir)/config/arm/vec-common.md \