Message ID | 20241217104729.4115134-1-bin.lan.cn@windriver.com |
---|---|
State | Accepted, archived |
Commit | 30d4f18d1e11b3336c8668dccd96b9ff35c7bc76 |
Headers | show |
Series | gcc: backport patch to fix data relocation to !ENDBR: stpcpy | expand |
On Tue, Dec 17, 2024 at 2:47 AM Bin Lan via lists.openembedded.org <bin.lan.cn=windriver.com@lists.openembedded.org> wrote: > > There is the following warning when building linux-yocto with > default configuration on x86-64 with gcc-14.2: > AR built-in.a > AR vmlinux.a > LD vmlinux.o > vmlinux.o: warning: objtool: .export_symbol+0x332a0: data relocation to !ENDBR: stpcpy+0x0 > > This change set removes the warning. > > PR target/116174 [https://gcc.gnu.org/bugzilla/show_bug.cgi?id=116174] > > Signed-off-by: Bin Lan <bin.lan.cn@windriver.com> > --- > meta/recipes-devtools/gcc/gcc-14.2.inc | 1 + > ...ch-to-fix-data-relocation-to-ENDBR-s.patch | 447 ++++++++++++++++++ > 2 files changed, 448 insertions(+) > create mode 100644 meta/recipes-devtools/gcc/gcc/0027-gcc-backport-patch-to-fix-data-relocation-to-ENDBR-s.patch > > diff --git a/meta/recipes-devtools/gcc/gcc-14.2.inc b/meta/recipes-devtools/gcc/gcc-14.2.inc > index 4f505bef68..a25bc019e5 100644 > --- a/meta/recipes-devtools/gcc/gcc-14.2.inc > +++ b/meta/recipes-devtools/gcc/gcc-14.2.inc > @@ -69,6 +69,7 @@ SRC_URI = "${BASEURI} \ > file://0024-Avoid-hardcoded-build-paths-into-ppc-libgcc.patch \ > file://0025-gcc-testsuite-tweaks-for-mips-OE.patch \ > file://0026-gcc-Fix-c-tweak-for-Wrange-loop-construct.patch \ > + file://0027-gcc-backport-patch-to-fix-data-relocation-to-ENDBR-s.patch \ tiny nit. is this tab vs spaces ? can you fix this. 
> file://gcc.git-ab884fffe3fc82a710bea66ad651720d71c938b8.patch \ > " > > diff --git a/meta/recipes-devtools/gcc/gcc/0027-gcc-backport-patch-to-fix-data-relocation-to-ENDBR-s.patch b/meta/recipes-devtools/gcc/gcc/0027-gcc-backport-patch-to-fix-data-relocation-to-ENDBR-s.patch > new file mode 100644 > index 0000000000..5bede60816 > --- /dev/null > +++ b/meta/recipes-devtools/gcc/gcc/0027-gcc-backport-patch-to-fix-data-relocation-to-ENDBR-s.patch > @@ -0,0 +1,447 @@ > +From 4e7735a8d87559bbddfe3a985786996e22241f8d Mon Sep 17 00:00:00 2001 > +From: liuhongt <hongtao.liu@intel.com> > +Date: Mon, 12 Aug 2024 14:35:31 +0800 > +Subject: [PATCH] Move ix86_align_loops into a separate pass and insert the > + pass after pass_endbr_and_patchable_area. > + > +gcc/ChangeLog: > + > + PR target/116174 > + * config/i386/i386.cc (ix86_align_loops): Move this to .. > + * config/i386/i386-features.cc (ix86_align_loops): .. here. > + (class pass_align_tight_loops): New class. > + (make_pass_align_tight_loops): New function. > + * config/i386/i386-passes.def: Insert pass_align_tight_loops > + after pass_insert_endbr_and_patchable_area. > + * config/i386/i386-protos.h (make_pass_align_tight_loops): New > + declare. > + > +gcc/testsuite/ChangeLog: > + > + * gcc.target/i386/pr116174.c: New test. 
> + > +(cherry picked from commit c3c83d22d212a35cb1bfb8727477819463f0dcd8) > + > +Upstream-Status: Backport [https://gcc.gnu.org/git/?p=gcc.git;a=patch;h=4e7735a8d87559bbddfe3a985786996e22241f8d] > + > +Signed-off-by: Bin Lan <bin.lan.cn@windriver.com> > +--- > + gcc/config/i386/i386-features.cc | 191 +++++++++++++++++++++++ > + gcc/config/i386/i386-passes.def | 3 + > + gcc/config/i386/i386-protos.h | 1 + > + gcc/config/i386/i386.cc | 146 ----------------- > + gcc/testsuite/gcc.target/i386/pr116174.c | 12 ++ > + 5 files changed, 207 insertions(+), 146 deletions(-) > + create mode 100644 gcc/testsuite/gcc.target/i386/pr116174.c > + > +diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc > +index e3e004d55267..7de19d423637 100644 > +--- a/gcc/config/i386/i386-features.cc > ++++ b/gcc/config/i386/i386-features.cc > +@@ -3253,6 +3253,197 @@ make_pass_remove_partial_avx_dependency (gcc::context *ctxt) > + return new pass_remove_partial_avx_dependency (ctxt); > + } > + > ++/* When a hot loop can be fit into one cacheline, > ++ force align the loop without considering the max skip. */ > ++static void > ++ix86_align_loops () > ++{ > ++ basic_block bb; > ++ > ++ /* Don't do this when we don't know cache line size. 
*/ > ++ if (ix86_cost->prefetch_block == 0) > ++ return; > ++ > ++ loop_optimizer_init (AVOID_CFG_MODIFICATIONS); > ++ profile_count count_threshold = cfun->cfg->count_max / param_align_threshold; > ++ FOR_EACH_BB_FN (bb, cfun) > ++ { > ++ rtx_insn *label = BB_HEAD (bb); > ++ bool has_fallthru = 0; > ++ edge e; > ++ edge_iterator ei; > ++ > ++ if (!LABEL_P (label)) > ++ continue; > ++ > ++ profile_count fallthru_count = profile_count::zero (); > ++ profile_count branch_count = profile_count::zero (); > ++ > ++ FOR_EACH_EDGE (e, ei, bb->preds) > ++ { > ++ if (e->flags & EDGE_FALLTHRU) > ++ has_fallthru = 1, fallthru_count += e->count (); > ++ else > ++ branch_count += e->count (); > ++ } > ++ > ++ if (!fallthru_count.initialized_p () || !branch_count.initialized_p ()) > ++ continue; > ++ > ++ if (bb->loop_father > ++ && bb->loop_father->latch != EXIT_BLOCK_PTR_FOR_FN (cfun) > ++ && (has_fallthru > ++ ? (!(single_succ_p (bb) > ++ && single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (cfun)) > ++ && optimize_bb_for_speed_p (bb) > ++ && branch_count + fallthru_count > count_threshold > ++ && (branch_count > fallthru_count * param_align_loop_iterations)) > ++ /* In case there'no fallthru for the loop. > ++ Nops inserted won't be executed. */ > ++ : (branch_count > count_threshold > ++ || (bb->count > bb->prev_bb->count * 10 > ++ && (bb->prev_bb->count > ++ <= ENTRY_BLOCK_PTR_FOR_FN (cfun)->count / 2))))) > ++ { > ++ rtx_insn* insn, *end_insn; > ++ HOST_WIDE_INT size = 0; > ++ bool padding_p = true; > ++ basic_block tbb = bb; > ++ unsigned cond_branch_num = 0; > ++ bool detect_tight_loop_p = false; > ++ > ++ for (unsigned int i = 0; i != bb->loop_father->num_nodes; > ++ i++, tbb = tbb->next_bb) > ++ { > ++ /* Only handle continuous cfg layout. 
*/ > ++ if (bb->loop_father != tbb->loop_father) > ++ { > ++ padding_p = false; > ++ break; > ++ } > ++ > ++ FOR_BB_INSNS (tbb, insn) > ++ { > ++ if (!NONDEBUG_INSN_P (insn)) > ++ continue; > ++ size += ix86_min_insn_size (insn); > ++ > ++ /* We don't know size of inline asm. > ++ Don't align loop for call. */ > ++ if (asm_noperands (PATTERN (insn)) >= 0 > ++ || CALL_P (insn)) > ++ { > ++ size = -1; > ++ break; > ++ } > ++ } > ++ > ++ if (size == -1 || size > ix86_cost->prefetch_block) > ++ { > ++ padding_p = false; > ++ break; > ++ } > ++ > ++ FOR_EACH_EDGE (e, ei, tbb->succs) > ++ { > ++ /* It could be part of the loop. */ > ++ if (e->dest == bb) > ++ { > ++ detect_tight_loop_p = true; > ++ break; > ++ } > ++ } > ++ > ++ if (detect_tight_loop_p) > ++ break; > ++ > ++ end_insn = BB_END (tbb); > ++ if (JUMP_P (end_insn)) > ++ { > ++ /* For decoded icache: > ++ 1. Up to two branches are allowed per Way. > ++ 2. A non-conditional branch is the last micro-op in a Way. > ++ */ > ++ if (onlyjump_p (end_insn) > ++ && (any_uncondjump_p (end_insn) > ++ || single_succ_p (tbb))) > ++ { > ++ padding_p = false; > ++ break; > ++ } > ++ else if (++cond_branch_num >= 2) > ++ { > ++ padding_p = false; > ++ break; > ++ } > ++ } > ++ > ++ } > ++ > ++ if (padding_p && detect_tight_loop_p) > ++ { > ++ emit_insn_before (gen_max_skip_align (GEN_INT (ceil_log2 (size)), > ++ GEN_INT (0)), label); > ++ /* End of function. */ > ++ if (!tbb || tbb == EXIT_BLOCK_PTR_FOR_FN (cfun)) > ++ break; > ++ /* Skip bb which already fits into one cacheline. 
*/ > ++ bb = tbb; > ++ } > ++ } > ++ } > ++ > ++ loop_optimizer_finalize (); > ++ free_dominance_info (CDI_DOMINATORS); > ++} > ++ > ++namespace { > ++ > ++const pass_data pass_data_align_tight_loops = > ++{ > ++ RTL_PASS, /* type */ > ++ "align_tight_loops", /* name */ > ++ OPTGROUP_NONE, /* optinfo_flags */ > ++ TV_MACH_DEP, /* tv_id */ > ++ 0, /* properties_required */ > ++ 0, /* properties_provided */ > ++ 0, /* properties_destroyed */ > ++ 0, /* todo_flags_start */ > ++ 0, /* todo_flags_finish */ > ++}; > ++ > ++class pass_align_tight_loops : public rtl_opt_pass > ++{ > ++public: > ++ pass_align_tight_loops (gcc::context *ctxt) > ++ : rtl_opt_pass (pass_data_align_tight_loops, ctxt) > ++ {} > ++ > ++ /* opt_pass methods: */ > ++ bool gate (function *) final override > ++ { > ++ return optimize && optimize_function_for_speed_p (cfun); > ++ } > ++ > ++ unsigned int execute (function *) final override > ++ { > ++ timevar_push (TV_MACH_DEP); > ++#ifdef ASM_OUTPUT_MAX_SKIP_ALIGN > ++ ix86_align_loops (); > ++#endif > ++ timevar_pop (TV_MACH_DEP); > ++ return 0; > ++ } > ++}; // class pass_align_tight_loops > ++ > ++} // anon namespace > ++ > ++rtl_opt_pass * > ++make_pass_align_tight_loops (gcc::context *ctxt) > ++{ > ++ return new pass_align_tight_loops (ctxt); > ++} > ++ > + /* This compares the priority of target features in function DECL1 > + and DECL2. It returns positive value if DECL1 is higher priority, > + negative value if DECL2 is higher priority and 0 if they are the > +diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def > +index 7d96766f7b96..e500f15c9971 100644 > +--- a/gcc/config/i386/i386-passes.def > ++++ b/gcc/config/i386/i386-passes.def > +@@ -31,5 +31,8 @@ along with GCC; see the file COPYING3. 
If not see > + INSERT_PASS_BEFORE (pass_cse2, 1, pass_stv, true /* timode_p */); > + > + INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_endbr_and_patchable_area); > ++ /* pass_align_tight_loops must be after pass_insert_endbr_and_patchable_area. > ++ PR116174. */ > ++ INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_align_tight_loops); > + > + INSERT_PASS_AFTER (pass_combine, 1, pass_remove_partial_avx_dependency); > +diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h > +index 46214a63974d..36c7b1aed42b 100644 > +--- a/gcc/config/i386/i386-protos.h > ++++ b/gcc/config/i386/i386-protos.h > +@@ -419,6 +419,7 @@ extern rtl_opt_pass *make_pass_insert_endbr_and_patchable_area > + (gcc::context *); > + extern rtl_opt_pass *make_pass_remove_partial_avx_dependency > + (gcc::context *); > ++extern rtl_opt_pass *make_pass_align_tight_loops (gcc::context *); > + > + extern bool ix86_has_no_direct_extern_access; > + > +diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc > +index 6f89891d3cb5..288c69467d62 100644 > +--- a/gcc/config/i386/i386.cc > ++++ b/gcc/config/i386/i386.cc > +@@ -23444,150 +23444,6 @@ ix86_split_stlf_stall_load () > + } > + } > + > +-/* When a hot loop can be fit into one cacheline, > +- force align the loop without considering the max skip. */ > +-static void > +-ix86_align_loops () > +-{ > +- basic_block bb; > +- > +- /* Don't do this when we don't know cache line size. 
*/ > +- if (ix86_cost->prefetch_block == 0) > +- return; > +- > +- loop_optimizer_init (AVOID_CFG_MODIFICATIONS); > +- profile_count count_threshold = cfun->cfg->count_max / param_align_threshold; > +- FOR_EACH_BB_FN (bb, cfun) > +- { > +- rtx_insn *label = BB_HEAD (bb); > +- bool has_fallthru = 0; > +- edge e; > +- edge_iterator ei; > +- > +- if (!LABEL_P (label)) > +- continue; > +- > +- profile_count fallthru_count = profile_count::zero (); > +- profile_count branch_count = profile_count::zero (); > +- > +- FOR_EACH_EDGE (e, ei, bb->preds) > +- { > +- if (e->flags & EDGE_FALLTHRU) > +- has_fallthru = 1, fallthru_count += e->count (); > +- else > +- branch_count += e->count (); > +- } > +- > +- if (!fallthru_count.initialized_p () || !branch_count.initialized_p ()) > +- continue; > +- > +- if (bb->loop_father > +- && bb->loop_father->latch != EXIT_BLOCK_PTR_FOR_FN (cfun) > +- && (has_fallthru > +- ? (!(single_succ_p (bb) > +- && single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (cfun)) > +- && optimize_bb_for_speed_p (bb) > +- && branch_count + fallthru_count > count_threshold > +- && (branch_count > fallthru_count * param_align_loop_iterations)) > +- /* In case there'no fallthru for the loop. > +- Nops inserted won't be executed. */ > +- : (branch_count > count_threshold > +- || (bb->count > bb->prev_bb->count * 10 > +- && (bb->prev_bb->count > +- <= ENTRY_BLOCK_PTR_FOR_FN (cfun)->count / 2))))) > +- { > +- rtx_insn* insn, *end_insn; > +- HOST_WIDE_INT size = 0; > +- bool padding_p = true; > +- basic_block tbb = bb; > +- unsigned cond_branch_num = 0; > +- bool detect_tight_loop_p = false; > +- > +- for (unsigned int i = 0; i != bb->loop_father->num_nodes; > +- i++, tbb = tbb->next_bb) > +- { > +- /* Only handle continuous cfg layout. 
*/ > +- if (bb->loop_father != tbb->loop_father) > +- { > +- padding_p = false; > +- break; > +- } > +- > +- FOR_BB_INSNS (tbb, insn) > +- { > +- if (!NONDEBUG_INSN_P (insn)) > +- continue; > +- size += ix86_min_insn_size (insn); > +- > +- /* We don't know size of inline asm. > +- Don't align loop for call. */ > +- if (asm_noperands (PATTERN (insn)) >= 0 > +- || CALL_P (insn)) > +- { > +- size = -1; > +- break; > +- } > +- } > +- > +- if (size == -1 || size > ix86_cost->prefetch_block) > +- { > +- padding_p = false; > +- break; > +- } > +- > +- FOR_EACH_EDGE (e, ei, tbb->succs) > +- { > +- /* It could be part of the loop. */ > +- if (e->dest == bb) > +- { > +- detect_tight_loop_p = true; > +- break; > +- } > +- } > +- > +- if (detect_tight_loop_p) > +- break; > +- > +- end_insn = BB_END (tbb); > +- if (JUMP_P (end_insn)) > +- { > +- /* For decoded icache: > +- 1. Up to two branches are allowed per Way. > +- 2. A non-conditional branch is the last micro-op in a Way. > +- */ > +- if (onlyjump_p (end_insn) > +- && (any_uncondjump_p (end_insn) > +- || single_succ_p (tbb))) > +- { > +- padding_p = false; > +- break; > +- } > +- else if (++cond_branch_num >= 2) > +- { > +- padding_p = false; > +- break; > +- } > +- } > +- > +- } > +- > +- if (padding_p && detect_tight_loop_p) > +- { > +- emit_insn_before (gen_max_skip_align (GEN_INT (ceil_log2 (size)), > +- GEN_INT (0)), label); > +- /* End of function. */ > +- if (!tbb || tbb == EXIT_BLOCK_PTR_FOR_FN (cfun)) > +- break; > +- /* Skip bb which already fits into one cacheline. */ > +- bb = tbb; > +- } > +- } > +- } > +- > +- loop_optimizer_finalize (); > +- free_dominance_info (CDI_DOMINATORS); > +-} > +- > + /* Implement machine specific optimizations. We implement padding of returns > + for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window. 
*/ > + static void > +@@ -23611,8 +23467,6 @@ ix86_reorg (void) > + #ifdef ASM_OUTPUT_MAX_SKIP_ALIGN > + if (TARGET_FOUR_JUMP_LIMIT) > + ix86_avoid_jump_mispredicts (); > +- > +- ix86_align_loops (); > + #endif > + } > + } > +diff --git a/gcc/testsuite/gcc.target/i386/pr116174.c b/gcc/testsuite/gcc.target/i386/pr116174.c > +new file mode 100644 > +index 000000000000..8877d0b51af1 > +--- /dev/null > ++++ b/gcc/testsuite/gcc.target/i386/pr116174.c > +@@ -0,0 +1,12 @@ > ++/* { dg-do compile { target *-*-linux* } } */ > ++/* { dg-options "-O2 -fcf-protection=branch" } */ > ++ > ++char * > ++foo (char *dest, const char *src) > ++{ > ++ while ((*dest++ = *src++) != '\0') > ++ /* nothing */; > ++ return --dest; > ++} > ++ > ++/* { dg-final { scan-assembler "\t\.cfi_startproc\n\tendbr(32|64)\n" } } */ > +-- > +2.43.5 > -- > 2.34.1 > > > -=-=-=-=-=-=-=-=-=-=-=- > Links: You receive all messages sent to this group. > View/Reply Online (#208831): https://lists.openembedded.org/g/openembedded-core/message/208831 > Mute This Topic: https://lists.openembedded.org/mt/110160747/1997914 > Group Owner: openembedded-core+owner@lists.openembedded.org > Unsubscribe: https://lists.openembedded.org/g/openembedded-core/unsub [raj.khem@gmail.com] > -=-=-=-=-=-=-=-=-=-=-=- >
On Tue, 2024-12-17 at 09:20 -0800, Khem Raj via lists.openembedded.org wrote: > On Tue, Dec 17, 2024 at 2:47 AM Bin Lan via lists.openembedded.org > <bin.lan.cn=windriver.com@lists.openembedded.org> wrote: > > > > There is the following warning when building linux-yocto with > > default configuration on x86-64 with gcc-14.2: > > AR built-in.a > > AR vmlinux.a > > LD vmlinux.o > > vmlinux.o: warning: objtool: .export_symbol+0x332a0: data relocation to !ENDBR: stpcpy+0x0 > > > > This change set removes the warning. > > > > PR target/116174 [https://gcc.gnu.org/bugzilla/show_bug.cgi?id=116174] > > > > Signed-off-by: Bin Lan <bin.lan.cn@windriver.com> > > --- > > meta/recipes-devtools/gcc/gcc-14.2.inc | 1 + > > ...ch-to-fix-data-relocation-to-ENDBR-s.patch | 447 ++++++++++++++++++ > > 2 files changed, 448 insertions(+) > > create mode 100644 meta/recipes-devtools/gcc/gcc/0027-gcc-backport-patch-to-fix-data-relocation-to-ENDBR-s.patch > > > > diff --git a/meta/recipes-devtools/gcc/gcc-14.2.inc b/meta/recipes-devtools/gcc/gcc-14.2.inc > > index 4f505bef68..a25bc019e5 100644 > > --- a/meta/recipes-devtools/gcc/gcc-14.2.inc > > +++ b/meta/recipes-devtools/gcc/gcc-14.2.inc > > @@ -69,6 +69,7 @@ SRC_URI = "${BASEURI} \ > > file://0024-Avoid-hardcoded-build-paths-into-ppc-libgcc.patch \ > > file://0025-gcc-testsuite-tweaks-for-mips-OE.patch \ > > file://0026-gcc-Fix-c-tweak-for-Wrange-loop-construct.patch \ > > + file://0027-gcc-backport-patch-to-fix-data-relocation-to-ENDBR-s.patch \ > > tiny nit. > is this tab vs spaces ? can you fix this. The tab/space mismatch looks to be in the original file rather than an addition by this patch :/. Cheers, Richard
On Tue, Dec 17, 2024 at 1:14 PM Richard Purdie <richard.purdie@linuxfoundation.org> wrote: > > On Tue, 2024-12-17 at 09:20 -0800, Khem Raj via lists.openembedded.org wrote: > > On Tue, Dec 17, 2024 at 2:47 AM Bin Lan via lists.openembedded.org > > <bin.lan.cn=windriver.com@lists.openembedded.org> wrote: > > > > > > There is the following warning when building linux-yocto with > > > default configuration on x86-64 with gcc-14.2: > > > AR built-in.a > > > AR vmlinux.a > > > LD vmlinux.o > > > vmlinux.o: warning: objtool: .export_symbol+0x332a0: data relocation to !ENDBR: stpcpy+0x0 > > > > > > This change set removes the warning. > > > > > > PR target/116174 [https://gcc.gnu.org/bugzilla/show_bug.cgi?id=116174] > > > > > > Signed-off-by: Bin Lan <bin.lan.cn@windriver.com> > > > --- > > > meta/recipes-devtools/gcc/gcc-14.2.inc | 1 + > > > ...ch-to-fix-data-relocation-to-ENDBR-s.patch | 447 ++++++++++++++++++ > > > 2 files changed, 448 insertions(+) > > > create mode 100644 meta/recipes-devtools/gcc/gcc/0027-gcc-backport-patch-to-fix-data-relocation-to-ENDBR-s.patch > > > > > > diff --git a/meta/recipes-devtools/gcc/gcc-14.2.inc b/meta/recipes-devtools/gcc/gcc-14.2.inc > > > index 4f505bef68..a25bc019e5 100644 > > > --- a/meta/recipes-devtools/gcc/gcc-14.2.inc > > > +++ b/meta/recipes-devtools/gcc/gcc-14.2.inc > > > @@ -69,6 +69,7 @@ SRC_URI = "${BASEURI} \ > > > file://0024-Avoid-hardcoded-build-paths-into-ppc-libgcc.patch \ > > > file://0025-gcc-testsuite-tweaks-for-mips-OE.patch \ > > > file://0026-gcc-Fix-c-tweak-for-Wrange-loop-construct.patch \ > > > + file://0027-gcc-backport-patch-to-fix-data-relocation-to-ENDBR-s.patch \ > > > > tiny nit. > > is this tab vs spaces ? can you fix this. > > Looks to be in the original file rather than an addition by this patch :/. Sadly, yes. > > Cheers, > > Richard
On 12/18/24 01:20, Khem Raj wrote: > CAUTION: This email comes from a non Wind River email account! > Do not click links or open attachments unless you recognize the sender and know the content is safe. > > On Tue, Dec 17, 2024 at 2:47 AM Bin Lan via lists.openembedded.org > <bin.lan.cn=windriver.com@lists.openembedded.org> wrote: >> There is the following warning when building linux-yocto with >> default configuration on x86-64 with gcc-14.2: >> AR built-in.a >> AR vmlinux.a >> LD vmlinux.o >> vmlinux.o: warning: objtool: .export_symbol+0x332a0: data relocation to !ENDBR: stpcpy+0x0 >> >> This change set removes the warning. >> >> PR target/116174 [https://gcc.gnu.org/bugzilla/show_bug.cgi?id=116174] >> >> Signed-off-by: Bin Lan <bin.lan.cn@windriver.com> >> --- >> meta/recipes-devtools/gcc/gcc-14.2.inc | 1 + >> ...ch-to-fix-data-relocation-to-ENDBR-s.patch | 447 ++++++++++++++++++ >> 2 files changed, 448 insertions(+) >> create mode 100644 meta/recipes-devtools/gcc/gcc/0027-gcc-backport-patch-to-fix-data-relocation-to-ENDBR-s.patch >> >> diff --git a/meta/recipes-devtools/gcc/gcc-14.2.inc b/meta/recipes-devtools/gcc/gcc-14.2.inc >> index 4f505bef68..a25bc019e5 100644 >> --- a/meta/recipes-devtools/gcc/gcc-14.2.inc >> +++ b/meta/recipes-devtools/gcc/gcc-14.2.inc >> @@ -69,6 +69,7 @@ SRC_URI = "${BASEURI} \ >> file://0024-Avoid-hardcoded-build-paths-into-ppc-libgcc.patch \ >> file://0025-gcc-testsuite-tweaks-for-mips-OE.patch \ >> file://0026-gcc-Fix-c-tweak-for-Wrange-loop-construct.patch \ >> + file://0027-gcc-backport-patch-to-fix-data-relocation-to-ENDBR-s.patch \ > tiny nit. > is this tab vs spaces ? can you fix this. I have fixed it and the v2 patch is sent out. 
//Bin Lan > >> file://gcc.git-ab884fffe3fc82a710bea66ad651720d71c938b8.patch \ >> " >> >> diff --git a/meta/recipes-devtools/gcc/gcc/0027-gcc-backport-patch-to-fix-data-relocation-to-ENDBR-s.patch b/meta/recipes-devtools/gcc/gcc/0027-gcc-backport-patch-to-fix-data-relocation-to-ENDBR-s.patch >> new file mode 100644 >> index 0000000000..5bede60816 >> --- /dev/null >> +++ b/meta/recipes-devtools/gcc/gcc/0027-gcc-backport-patch-to-fix-data-relocation-to-ENDBR-s.patch >> @@ -0,0 +1,447 @@ >> +From 4e7735a8d87559bbddfe3a985786996e22241f8d Mon Sep 17 00:00:00 2001 >> +From: liuhongt <hongtao.liu@intel.com> >> +Date: Mon, 12 Aug 2024 14:35:31 +0800 >> +Subject: [PATCH] Move ix86_align_loops into a separate pass and insert the >> + pass after pass_endbr_and_patchable_area. >> + >> +gcc/ChangeLog: >> + >> + PR target/116174 >> + * config/i386/i386.cc (ix86_align_loops): Move this to .. >> + * config/i386/i386-features.cc (ix86_align_loops): .. here. >> + (class pass_align_tight_loops): New class. >> + (make_pass_align_tight_loops): New function. >> + * config/i386/i386-passes.def: Insert pass_align_tight_loops >> + after pass_insert_endbr_and_patchable_area. >> + * config/i386/i386-protos.h (make_pass_align_tight_loops): New >> + declare. >> + >> +gcc/testsuite/ChangeLog: >> + >> + * gcc.target/i386/pr116174.c: New test. 
>> + >> +(cherry picked from commit c3c83d22d212a35cb1bfb8727477819463f0dcd8) >> + >> +Upstream-Status: Backport [https://gcc.gnu.org/git/?p=gcc.git;a=patch;h=4e7735a8d87559bbddfe3a985786996e22241f8d] >> + >> +Signed-off-by: Bin Lan <bin.lan.cn@windriver.com> >> +--- >> + gcc/config/i386/i386-features.cc | 191 +++++++++++++++++++++++ >> + gcc/config/i386/i386-passes.def | 3 + >> + gcc/config/i386/i386-protos.h | 1 + >> + gcc/config/i386/i386.cc | 146 ----------------- >> + gcc/testsuite/gcc.target/i386/pr116174.c | 12 ++ >> + 5 files changed, 207 insertions(+), 146 deletions(-) >> + create mode 100644 gcc/testsuite/gcc.target/i386/pr116174.c >> + >> +diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc >> +index e3e004d55267..7de19d423637 100644 >> +--- a/gcc/config/i386/i386-features.cc >> ++++ b/gcc/config/i386/i386-features.cc >> +@@ -3253,6 +3253,197 @@ make_pass_remove_partial_avx_dependency (gcc::context *ctxt) >> + return new pass_remove_partial_avx_dependency (ctxt); >> + } >> + >> ++/* When a hot loop can be fit into one cacheline, >> ++ force align the loop without considering the max skip. */ >> ++static void >> ++ix86_align_loops () >> ++{ >> ++ basic_block bb; >> ++ >> ++ /* Don't do this when we don't know cache line size. 
*/ >> ++ if (ix86_cost->prefetch_block == 0) >> ++ return; >> ++ >> ++ loop_optimizer_init (AVOID_CFG_MODIFICATIONS); >> ++ profile_count count_threshold = cfun->cfg->count_max / param_align_threshold; >> ++ FOR_EACH_BB_FN (bb, cfun) >> ++ { >> ++ rtx_insn *label = BB_HEAD (bb); >> ++ bool has_fallthru = 0; >> ++ edge e; >> ++ edge_iterator ei; >> ++ >> ++ if (!LABEL_P (label)) >> ++ continue; >> ++ >> ++ profile_count fallthru_count = profile_count::zero (); >> ++ profile_count branch_count = profile_count::zero (); >> ++ >> ++ FOR_EACH_EDGE (e, ei, bb->preds) >> ++ { >> ++ if (e->flags & EDGE_FALLTHRU) >> ++ has_fallthru = 1, fallthru_count += e->count (); >> ++ else >> ++ branch_count += e->count (); >> ++ } >> ++ >> ++ if (!fallthru_count.initialized_p () || !branch_count.initialized_p ()) >> ++ continue; >> ++ >> ++ if (bb->loop_father >> ++ && bb->loop_father->latch != EXIT_BLOCK_PTR_FOR_FN (cfun) >> ++ && (has_fallthru >> ++ ? (!(single_succ_p (bb) >> ++ && single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (cfun)) >> ++ && optimize_bb_for_speed_p (bb) >> ++ && branch_count + fallthru_count > count_threshold >> ++ && (branch_count > fallthru_count * param_align_loop_iterations)) >> ++ /* In case there'no fallthru for the loop. >> ++ Nops inserted won't be executed. */ >> ++ : (branch_count > count_threshold >> ++ || (bb->count > bb->prev_bb->count * 10 >> ++ && (bb->prev_bb->count >> ++ <= ENTRY_BLOCK_PTR_FOR_FN (cfun)->count / 2))))) >> ++ { >> ++ rtx_insn* insn, *end_insn; >> ++ HOST_WIDE_INT size = 0; >> ++ bool padding_p = true; >> ++ basic_block tbb = bb; >> ++ unsigned cond_branch_num = 0; >> ++ bool detect_tight_loop_p = false; >> ++ >> ++ for (unsigned int i = 0; i != bb->loop_father->num_nodes; >> ++ i++, tbb = tbb->next_bb) >> ++ { >> ++ /* Only handle continuous cfg layout. 
*/ >> ++ if (bb->loop_father != tbb->loop_father) >> ++ { >> ++ padding_p = false; >> ++ break; >> ++ } >> ++ >> ++ FOR_BB_INSNS (tbb, insn) >> ++ { >> ++ if (!NONDEBUG_INSN_P (insn)) >> ++ continue; >> ++ size += ix86_min_insn_size (insn); >> ++ >> ++ /* We don't know size of inline asm. >> ++ Don't align loop for call. */ >> ++ if (asm_noperands (PATTERN (insn)) >= 0 >> ++ || CALL_P (insn)) >> ++ { >> ++ size = -1; >> ++ break; >> ++ } >> ++ } >> ++ >> ++ if (size == -1 || size > ix86_cost->prefetch_block) >> ++ { >> ++ padding_p = false; >> ++ break; >> ++ } >> ++ >> ++ FOR_EACH_EDGE (e, ei, tbb->succs) >> ++ { >> ++ /* It could be part of the loop. */ >> ++ if (e->dest == bb) >> ++ { >> ++ detect_tight_loop_p = true; >> ++ break; >> ++ } >> ++ } >> ++ >> ++ if (detect_tight_loop_p) >> ++ break; >> ++ >> ++ end_insn = BB_END (tbb); >> ++ if (JUMP_P (end_insn)) >> ++ { >> ++ /* For decoded icache: >> ++ 1. Up to two branches are allowed per Way. >> ++ 2. A non-conditional branch is the last micro-op in a Way. >> ++ */ >> ++ if (onlyjump_p (end_insn) >> ++ && (any_uncondjump_p (end_insn) >> ++ || single_succ_p (tbb))) >> ++ { >> ++ padding_p = false; >> ++ break; >> ++ } >> ++ else if (++cond_branch_num >= 2) >> ++ { >> ++ padding_p = false; >> ++ break; >> ++ } >> ++ } >> ++ >> ++ } >> ++ >> ++ if (padding_p && detect_tight_loop_p) >> ++ { >> ++ emit_insn_before (gen_max_skip_align (GEN_INT (ceil_log2 (size)), >> ++ GEN_INT (0)), label); >> ++ /* End of function. */ >> ++ if (!tbb || tbb == EXIT_BLOCK_PTR_FOR_FN (cfun)) >> ++ break; >> ++ /* Skip bb which already fits into one cacheline. 
*/ >> ++ bb = tbb; >> ++ } >> ++ } >> ++ } >> ++ >> ++ loop_optimizer_finalize (); >> ++ free_dominance_info (CDI_DOMINATORS); >> ++} >> ++ >> ++namespace { >> ++ >> ++const pass_data pass_data_align_tight_loops = >> ++{ >> ++ RTL_PASS, /* type */ >> ++ "align_tight_loops", /* name */ >> ++ OPTGROUP_NONE, /* optinfo_flags */ >> ++ TV_MACH_DEP, /* tv_id */ >> ++ 0, /* properties_required */ >> ++ 0, /* properties_provided */ >> ++ 0, /* properties_destroyed */ >> ++ 0, /* todo_flags_start */ >> ++ 0, /* todo_flags_finish */ >> ++}; >> ++ >> ++class pass_align_tight_loops : public rtl_opt_pass >> ++{ >> ++public: >> ++ pass_align_tight_loops (gcc::context *ctxt) >> ++ : rtl_opt_pass (pass_data_align_tight_loops, ctxt) >> ++ {} >> ++ >> ++ /* opt_pass methods: */ >> ++ bool gate (function *) final override >> ++ { >> ++ return optimize && optimize_function_for_speed_p (cfun); >> ++ } >> ++ >> ++ unsigned int execute (function *) final override >> ++ { >> ++ timevar_push (TV_MACH_DEP); >> ++#ifdef ASM_OUTPUT_MAX_SKIP_ALIGN >> ++ ix86_align_loops (); >> ++#endif >> ++ timevar_pop (TV_MACH_DEP); >> ++ return 0; >> ++ } >> ++}; // class pass_align_tight_loops >> ++ >> ++} // anon namespace >> ++ >> ++rtl_opt_pass * >> ++make_pass_align_tight_loops (gcc::context *ctxt) >> ++{ >> ++ return new pass_align_tight_loops (ctxt); >> ++} >> ++ >> + /* This compares the priority of target features in function DECL1 >> + and DECL2. It returns positive value if DECL1 is higher priority, >> + negative value if DECL2 is higher priority and 0 if they are the >> +diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def >> +index 7d96766f7b96..e500f15c9971 100644 >> +--- a/gcc/config/i386/i386-passes.def >> ++++ b/gcc/config/i386/i386-passes.def >> +@@ -31,5 +31,8 @@ along with GCC; see the file COPYING3. 
If not see >> + INSERT_PASS_BEFORE (pass_cse2, 1, pass_stv, true /* timode_p */); >> + >> + INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_endbr_and_patchable_area); >> ++ /* pass_align_tight_loops must be after pass_insert_endbr_and_patchable_area. >> ++ PR116174. */ >> ++ INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_align_tight_loops); >> + >> + INSERT_PASS_AFTER (pass_combine, 1, pass_remove_partial_avx_dependency); >> +diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h >> +index 46214a63974d..36c7b1aed42b 100644 >> +--- a/gcc/config/i386/i386-protos.h >> ++++ b/gcc/config/i386/i386-protos.h >> +@@ -419,6 +419,7 @@ extern rtl_opt_pass *make_pass_insert_endbr_and_patchable_area >> + (gcc::context *); >> + extern rtl_opt_pass *make_pass_remove_partial_avx_dependency >> + (gcc::context *); >> ++extern rtl_opt_pass *make_pass_align_tight_loops (gcc::context *); >> + >> + extern bool ix86_has_no_direct_extern_access; >> + >> +diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc >> +index 6f89891d3cb5..288c69467d62 100644 >> +--- a/gcc/config/i386/i386.cc >> ++++ b/gcc/config/i386/i386.cc >> +@@ -23444,150 +23444,6 @@ ix86_split_stlf_stall_load () >> + } >> + } >> + >> +-/* When a hot loop can be fit into one cacheline, >> +- force align the loop without considering the max skip. */ >> +-static void >> +-ix86_align_loops () >> +-{ >> +- basic_block bb; >> +- >> +- /* Don't do this when we don't know cache line size. 
*/ >> +- if (ix86_cost->prefetch_block == 0) >> +- return; >> +- >> +- loop_optimizer_init (AVOID_CFG_MODIFICATIONS); >> +- profile_count count_threshold = cfun->cfg->count_max / param_align_threshold; >> +- FOR_EACH_BB_FN (bb, cfun) >> +- { >> +- rtx_insn *label = BB_HEAD (bb); >> +- bool has_fallthru = 0; >> +- edge e; >> +- edge_iterator ei; >> +- >> +- if (!LABEL_P (label)) >> +- continue; >> +- >> +- profile_count fallthru_count = profile_count::zero (); >> +- profile_count branch_count = profile_count::zero (); >> +- >> +- FOR_EACH_EDGE (e, ei, bb->preds) >> +- { >> +- if (e->flags & EDGE_FALLTHRU) >> +- has_fallthru = 1, fallthru_count += e->count (); >> +- else >> +- branch_count += e->count (); >> +- } >> +- >> +- if (!fallthru_count.initialized_p () || !branch_count.initialized_p ()) >> +- continue; >> +- >> +- if (bb->loop_father >> +- && bb->loop_father->latch != EXIT_BLOCK_PTR_FOR_FN (cfun) >> +- && (has_fallthru >> +- ? (!(single_succ_p (bb) >> +- && single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (cfun)) >> +- && optimize_bb_for_speed_p (bb) >> +- && branch_count + fallthru_count > count_threshold >> +- && (branch_count > fallthru_count * param_align_loop_iterations)) >> +- /* In case there'no fallthru for the loop. >> +- Nops inserted won't be executed. */ >> +- : (branch_count > count_threshold >> +- || (bb->count > bb->prev_bb->count * 10 >> +- && (bb->prev_bb->count >> +- <= ENTRY_BLOCK_PTR_FOR_FN (cfun)->count / 2))))) >> +- { >> +- rtx_insn* insn, *end_insn; >> +- HOST_WIDE_INT size = 0; >> +- bool padding_p = true; >> +- basic_block tbb = bb; >> +- unsigned cond_branch_num = 0; >> +- bool detect_tight_loop_p = false; >> +- >> +- for (unsigned int i = 0; i != bb->loop_father->num_nodes; >> +- i++, tbb = tbb->next_bb) >> +- { >> +- /* Only handle continuous cfg layout. 
*/ >> +- if (bb->loop_father != tbb->loop_father) >> +- { >> +- padding_p = false; >> +- break; >> +- } >> +- >> +- FOR_BB_INSNS (tbb, insn) >> +- { >> +- if (!NONDEBUG_INSN_P (insn)) >> +- continue; >> +- size += ix86_min_insn_size (insn); >> +- >> +- /* We don't know size of inline asm. >> +- Don't align loop for call. */ >> +- if (asm_noperands (PATTERN (insn)) >= 0 >> +- || CALL_P (insn)) >> +- { >> +- size = -1; >> +- break; >> +- } >> +- } >> +- >> +- if (size == -1 || size > ix86_cost->prefetch_block) >> +- { >> +- padding_p = false; >> +- break; >> +- } >> +- >> +- FOR_EACH_EDGE (e, ei, tbb->succs) >> +- { >> +- /* It could be part of the loop. */ >> +- if (e->dest == bb) >> +- { >> +- detect_tight_loop_p = true; >> +- break; >> +- } >> +- } >> +- >> +- if (detect_tight_loop_p) >> +- break; >> +- >> +- end_insn = BB_END (tbb); >> +- if (JUMP_P (end_insn)) >> +- { >> +- /* For decoded icache: >> +- 1. Up to two branches are allowed per Way. >> +- 2. A non-conditional branch is the last micro-op in a Way. >> +- */ >> +- if (onlyjump_p (end_insn) >> +- && (any_uncondjump_p (end_insn) >> +- || single_succ_p (tbb))) >> +- { >> +- padding_p = false; >> +- break; >> +- } >> +- else if (++cond_branch_num >= 2) >> +- { >> +- padding_p = false; >> +- break; >> +- } >> +- } >> +- >> +- } >> +- >> +- if (padding_p && detect_tight_loop_p) >> +- { >> +- emit_insn_before (gen_max_skip_align (GEN_INT (ceil_log2 (size)), >> +- GEN_INT (0)), label); >> +- /* End of function. */ >> +- if (!tbb || tbb == EXIT_BLOCK_PTR_FOR_FN (cfun)) >> +- break; >> +- /* Skip bb which already fits into one cacheline. */ >> +- bb = tbb; >> +- } >> +- } >> +- } >> +- >> +- loop_optimizer_finalize (); >> +- free_dominance_info (CDI_DOMINATORS); >> +-} >> +- >> + /* Implement machine specific optimizations. We implement padding of returns >> + for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window. 
*/ >> + static void >> +@@ -23611,8 +23467,6 @@ ix86_reorg (void) >> + #ifdef ASM_OUTPUT_MAX_SKIP_ALIGN >> + if (TARGET_FOUR_JUMP_LIMIT) >> + ix86_avoid_jump_mispredicts (); >> +- >> +- ix86_align_loops (); >> + #endif >> + } >> + } >> +diff --git a/gcc/testsuite/gcc.target/i386/pr116174.c b/gcc/testsuite/gcc.target/i386/pr116174.c >> +new file mode 100644 >> +index 000000000000..8877d0b51af1 >> +--- /dev/null >> ++++ b/gcc/testsuite/gcc.target/i386/pr116174.c >> +@@ -0,0 +1,12 @@ >> ++/* { dg-do compile { target *-*-linux* } } */ >> ++/* { dg-options "-O2 -fcf-protection=branch" } */ >> ++ >> ++char * >> ++foo (char *dest, const char *src) >> ++{ >> ++ while ((*dest++ = *src++) != '\0') >> ++ /* nothing */; >> ++ return --dest; >> ++} >> ++ >> ++/* { dg-final { scan-assembler "\t\.cfi_startproc\n\tendbr(32|64)\n" } } */ >> +-- >> +2.43.5 >> -- >> 2.34.1 >> >> >> -=-=-=-=-=-=-=-=-=-=-=- >> Links: You receive all messages sent to this group. >> View/Reply Online (#208831): https://lists.openembedded.org/g/openembedded-core/message/208831 >> Mute This Topic: https://lists.openembedded.org/mt/110160747/1997914 >> Group Owner: openembedded-core+owner@lists.openembedded.org >> Unsubscribe: https://lists.openembedded.org/g/openembedded-core/unsub [raj.khem@gmail.com] >> -=-=-=-=-=-=-=-=-=-=-=- >>
diff --git a/meta/recipes-devtools/gcc/gcc-14.2.inc b/meta/recipes-devtools/gcc/gcc-14.2.inc index 4f505bef68..a25bc019e5 100644 --- a/meta/recipes-devtools/gcc/gcc-14.2.inc +++ b/meta/recipes-devtools/gcc/gcc-14.2.inc @@ -69,6 +69,7 @@ SRC_URI = "${BASEURI} \ file://0024-Avoid-hardcoded-build-paths-into-ppc-libgcc.patch \ file://0025-gcc-testsuite-tweaks-for-mips-OE.patch \ file://0026-gcc-Fix-c-tweak-for-Wrange-loop-construct.patch \ + file://0027-gcc-backport-patch-to-fix-data-relocation-to-ENDBR-s.patch \ file://gcc.git-ab884fffe3fc82a710bea66ad651720d71c938b8.patch \ " diff --git a/meta/recipes-devtools/gcc/gcc/0027-gcc-backport-patch-to-fix-data-relocation-to-ENDBR-s.patch b/meta/recipes-devtools/gcc/gcc/0027-gcc-backport-patch-to-fix-data-relocation-to-ENDBR-s.patch new file mode 100644 index 0000000000..5bede60816 --- /dev/null +++ b/meta/recipes-devtools/gcc/gcc/0027-gcc-backport-patch-to-fix-data-relocation-to-ENDBR-s.patch @@ -0,0 +1,447 @@ +From 4e7735a8d87559bbddfe3a985786996e22241f8d Mon Sep 17 00:00:00 2001 +From: liuhongt <hongtao.liu@intel.com> +Date: Mon, 12 Aug 2024 14:35:31 +0800 +Subject: [PATCH] Move ix86_align_loops into a separate pass and insert the + pass after pass_endbr_and_patchable_area. + +gcc/ChangeLog: + + PR target/116174 + * config/i386/i386.cc (ix86_align_loops): Move this to .. + * config/i386/i386-features.cc (ix86_align_loops): .. here. + (class pass_align_tight_loops): New class. + (make_pass_align_tight_loops): New function. + * config/i386/i386-passes.def: Insert pass_align_tight_loops + after pass_insert_endbr_and_patchable_area. + * config/i386/i386-protos.h (make_pass_align_tight_loops): New + declare. + +gcc/testsuite/ChangeLog: + + * gcc.target/i386/pr116174.c: New test. 
+ +(cherry picked from commit c3c83d22d212a35cb1bfb8727477819463f0dcd8) + +Upstream-Status: Backport [https://gcc.gnu.org/git/?p=gcc.git;a=patch;h=4e7735a8d87559bbddfe3a985786996e22241f8d] + +Signed-off-by: Bin Lan <bin.lan.cn@windriver.com> +--- + gcc/config/i386/i386-features.cc | 191 +++++++++++++++++++++++ + gcc/config/i386/i386-passes.def | 3 + + gcc/config/i386/i386-protos.h | 1 + + gcc/config/i386/i386.cc | 146 ----------------- + gcc/testsuite/gcc.target/i386/pr116174.c | 12 ++ + 5 files changed, 207 insertions(+), 146 deletions(-) + create mode 100644 gcc/testsuite/gcc.target/i386/pr116174.c + +diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc +index e3e004d55267..7de19d423637 100644 +--- a/gcc/config/i386/i386-features.cc ++++ b/gcc/config/i386/i386-features.cc +@@ -3253,6 +3253,197 @@ make_pass_remove_partial_avx_dependency (gcc::context *ctxt) + return new pass_remove_partial_avx_dependency (ctxt); + } + ++/* When a hot loop can be fit into one cacheline, ++ force align the loop without considering the max skip. */ ++static void ++ix86_align_loops () ++{ ++ basic_block bb; ++ ++ /* Don't do this when we don't know cache line size. 
*/ ++ if (ix86_cost->prefetch_block == 0) ++ return; ++ ++ loop_optimizer_init (AVOID_CFG_MODIFICATIONS); ++ profile_count count_threshold = cfun->cfg->count_max / param_align_threshold; ++ FOR_EACH_BB_FN (bb, cfun) ++ { ++ rtx_insn *label = BB_HEAD (bb); ++ bool has_fallthru = 0; ++ edge e; ++ edge_iterator ei; ++ ++ if (!LABEL_P (label)) ++ continue; ++ ++ profile_count fallthru_count = profile_count::zero (); ++ profile_count branch_count = profile_count::zero (); ++ ++ FOR_EACH_EDGE (e, ei, bb->preds) ++ { ++ if (e->flags & EDGE_FALLTHRU) ++ has_fallthru = 1, fallthru_count += e->count (); ++ else ++ branch_count += e->count (); ++ } ++ ++ if (!fallthru_count.initialized_p () || !branch_count.initialized_p ()) ++ continue; ++ ++ if (bb->loop_father ++ && bb->loop_father->latch != EXIT_BLOCK_PTR_FOR_FN (cfun) ++ && (has_fallthru ++ ? (!(single_succ_p (bb) ++ && single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (cfun)) ++ && optimize_bb_for_speed_p (bb) ++ && branch_count + fallthru_count > count_threshold ++ && (branch_count > fallthru_count * param_align_loop_iterations)) ++ /* In case there'no fallthru for the loop. ++ Nops inserted won't be executed. */ ++ : (branch_count > count_threshold ++ || (bb->count > bb->prev_bb->count * 10 ++ && (bb->prev_bb->count ++ <= ENTRY_BLOCK_PTR_FOR_FN (cfun)->count / 2))))) ++ { ++ rtx_insn* insn, *end_insn; ++ HOST_WIDE_INT size = 0; ++ bool padding_p = true; ++ basic_block tbb = bb; ++ unsigned cond_branch_num = 0; ++ bool detect_tight_loop_p = false; ++ ++ for (unsigned int i = 0; i != bb->loop_father->num_nodes; ++ i++, tbb = tbb->next_bb) ++ { ++ /* Only handle continuous cfg layout. */ ++ if (bb->loop_father != tbb->loop_father) ++ { ++ padding_p = false; ++ break; ++ } ++ ++ FOR_BB_INSNS (tbb, insn) ++ { ++ if (!NONDEBUG_INSN_P (insn)) ++ continue; ++ size += ix86_min_insn_size (insn); ++ ++ /* We don't know size of inline asm. ++ Don't align loop for call. 
*/ ++ if (asm_noperands (PATTERN (insn)) >= 0 ++ || CALL_P (insn)) ++ { ++ size = -1; ++ break; ++ } ++ } ++ ++ if (size == -1 || size > ix86_cost->prefetch_block) ++ { ++ padding_p = false; ++ break; ++ } ++ ++ FOR_EACH_EDGE (e, ei, tbb->succs) ++ { ++ /* It could be part of the loop. */ ++ if (e->dest == bb) ++ { ++ detect_tight_loop_p = true; ++ break; ++ } ++ } ++ ++ if (detect_tight_loop_p) ++ break; ++ ++ end_insn = BB_END (tbb); ++ if (JUMP_P (end_insn)) ++ { ++ /* For decoded icache: ++ 1. Up to two branches are allowed per Way. ++ 2. A non-conditional branch is the last micro-op in a Way. ++ */ ++ if (onlyjump_p (end_insn) ++ && (any_uncondjump_p (end_insn) ++ || single_succ_p (tbb))) ++ { ++ padding_p = false; ++ break; ++ } ++ else if (++cond_branch_num >= 2) ++ { ++ padding_p = false; ++ break; ++ } ++ } ++ ++ } ++ ++ if (padding_p && detect_tight_loop_p) ++ { ++ emit_insn_before (gen_max_skip_align (GEN_INT (ceil_log2 (size)), ++ GEN_INT (0)), label); ++ /* End of function. */ ++ if (!tbb || tbb == EXIT_BLOCK_PTR_FOR_FN (cfun)) ++ break; ++ /* Skip bb which already fits into one cacheline. 
*/ ++ bb = tbb; ++ } ++ } ++ } ++ ++ loop_optimizer_finalize (); ++ free_dominance_info (CDI_DOMINATORS); ++} ++ ++namespace { ++ ++const pass_data pass_data_align_tight_loops = ++{ ++ RTL_PASS, /* type */ ++ "align_tight_loops", /* name */ ++ OPTGROUP_NONE, /* optinfo_flags */ ++ TV_MACH_DEP, /* tv_id */ ++ 0, /* properties_required */ ++ 0, /* properties_provided */ ++ 0, /* properties_destroyed */ ++ 0, /* todo_flags_start */ ++ 0, /* todo_flags_finish */ ++}; ++ ++class pass_align_tight_loops : public rtl_opt_pass ++{ ++public: ++ pass_align_tight_loops (gcc::context *ctxt) ++ : rtl_opt_pass (pass_data_align_tight_loops, ctxt) ++ {} ++ ++ /* opt_pass methods: */ ++ bool gate (function *) final override ++ { ++ return optimize && optimize_function_for_speed_p (cfun); ++ } ++ ++ unsigned int execute (function *) final override ++ { ++ timevar_push (TV_MACH_DEP); ++#ifdef ASM_OUTPUT_MAX_SKIP_ALIGN ++ ix86_align_loops (); ++#endif ++ timevar_pop (TV_MACH_DEP); ++ return 0; ++ } ++}; // class pass_align_tight_loops ++ ++} // anon namespace ++ ++rtl_opt_pass * ++make_pass_align_tight_loops (gcc::context *ctxt) ++{ ++ return new pass_align_tight_loops (ctxt); ++} ++ + /* This compares the priority of target features in function DECL1 + and DECL2. It returns positive value if DECL1 is higher priority, + negative value if DECL2 is higher priority and 0 if they are the +diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def +index 7d96766f7b96..e500f15c9971 100644 +--- a/gcc/config/i386/i386-passes.def ++++ b/gcc/config/i386/i386-passes.def +@@ -31,5 +31,8 @@ along with GCC; see the file COPYING3. If not see + INSERT_PASS_BEFORE (pass_cse2, 1, pass_stv, true /* timode_p */); + + INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_endbr_and_patchable_area); ++ /* pass_align_tight_loops must be after pass_insert_endbr_and_patchable_area. ++ PR116174. 
*/ ++ INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_align_tight_loops); + + INSERT_PASS_AFTER (pass_combine, 1, pass_remove_partial_avx_dependency); +diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h +index 46214a63974d..36c7b1aed42b 100644 +--- a/gcc/config/i386/i386-protos.h ++++ b/gcc/config/i386/i386-protos.h +@@ -419,6 +419,7 @@ extern rtl_opt_pass *make_pass_insert_endbr_and_patchable_area + (gcc::context *); + extern rtl_opt_pass *make_pass_remove_partial_avx_dependency + (gcc::context *); ++extern rtl_opt_pass *make_pass_align_tight_loops (gcc::context *); + + extern bool ix86_has_no_direct_extern_access; + +diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc +index 6f89891d3cb5..288c69467d62 100644 +--- a/gcc/config/i386/i386.cc ++++ b/gcc/config/i386/i386.cc +@@ -23444,150 +23444,6 @@ ix86_split_stlf_stall_load () + } + } + +-/* When a hot loop can be fit into one cacheline, +- force align the loop without considering the max skip. */ +-static void +-ix86_align_loops () +-{ +- basic_block bb; +- +- /* Don't do this when we don't know cache line size. */ +- if (ix86_cost->prefetch_block == 0) +- return; +- +- loop_optimizer_init (AVOID_CFG_MODIFICATIONS); +- profile_count count_threshold = cfun->cfg->count_max / param_align_threshold; +- FOR_EACH_BB_FN (bb, cfun) +- { +- rtx_insn *label = BB_HEAD (bb); +- bool has_fallthru = 0; +- edge e; +- edge_iterator ei; +- +- if (!LABEL_P (label)) +- continue; +- +- profile_count fallthru_count = profile_count::zero (); +- profile_count branch_count = profile_count::zero (); +- +- FOR_EACH_EDGE (e, ei, bb->preds) +- { +- if (e->flags & EDGE_FALLTHRU) +- has_fallthru = 1, fallthru_count += e->count (); +- else +- branch_count += e->count (); +- } +- +- if (!fallthru_count.initialized_p () || !branch_count.initialized_p ()) +- continue; +- +- if (bb->loop_father +- && bb->loop_father->latch != EXIT_BLOCK_PTR_FOR_FN (cfun) +- && (has_fallthru +- ? 
(!(single_succ_p (bb) +- && single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (cfun)) +- && optimize_bb_for_speed_p (bb) +- && branch_count + fallthru_count > count_threshold +- && (branch_count > fallthru_count * param_align_loop_iterations)) +- /* In case there'no fallthru for the loop. +- Nops inserted won't be executed. */ +- : (branch_count > count_threshold +- || (bb->count > bb->prev_bb->count * 10 +- && (bb->prev_bb->count +- <= ENTRY_BLOCK_PTR_FOR_FN (cfun)->count / 2))))) +- { +- rtx_insn* insn, *end_insn; +- HOST_WIDE_INT size = 0; +- bool padding_p = true; +- basic_block tbb = bb; +- unsigned cond_branch_num = 0; +- bool detect_tight_loop_p = false; +- +- for (unsigned int i = 0; i != bb->loop_father->num_nodes; +- i++, tbb = tbb->next_bb) +- { +- /* Only handle continuous cfg layout. */ +- if (bb->loop_father != tbb->loop_father) +- { +- padding_p = false; +- break; +- } +- +- FOR_BB_INSNS (tbb, insn) +- { +- if (!NONDEBUG_INSN_P (insn)) +- continue; +- size += ix86_min_insn_size (insn); +- +- /* We don't know size of inline asm. +- Don't align loop for call. */ +- if (asm_noperands (PATTERN (insn)) >= 0 +- || CALL_P (insn)) +- { +- size = -1; +- break; +- } +- } +- +- if (size == -1 || size > ix86_cost->prefetch_block) +- { +- padding_p = false; +- break; +- } +- +- FOR_EACH_EDGE (e, ei, tbb->succs) +- { +- /* It could be part of the loop. */ +- if (e->dest == bb) +- { +- detect_tight_loop_p = true; +- break; +- } +- } +- +- if (detect_tight_loop_p) +- break; +- +- end_insn = BB_END (tbb); +- if (JUMP_P (end_insn)) +- { +- /* For decoded icache: +- 1. Up to two branches are allowed per Way. +- 2. A non-conditional branch is the last micro-op in a Way. 
+- */ +- if (onlyjump_p (end_insn) +- && (any_uncondjump_p (end_insn) +- || single_succ_p (tbb))) +- { +- padding_p = false; +- break; +- } +- else if (++cond_branch_num >= 2) +- { +- padding_p = false; +- break; +- } +- } +- +- } +- +- if (padding_p && detect_tight_loop_p) +- { +- emit_insn_before (gen_max_skip_align (GEN_INT (ceil_log2 (size)), +- GEN_INT (0)), label); +- /* End of function. */ +- if (!tbb || tbb == EXIT_BLOCK_PTR_FOR_FN (cfun)) +- break; +- /* Skip bb which already fits into one cacheline. */ +- bb = tbb; +- } +- } +- } +- +- loop_optimizer_finalize (); +- free_dominance_info (CDI_DOMINATORS); +-} +- + /* Implement machine specific optimizations. We implement padding of returns + for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window. */ + static void +@@ -23611,8 +23467,6 @@ ix86_reorg (void) + #ifdef ASM_OUTPUT_MAX_SKIP_ALIGN + if (TARGET_FOUR_JUMP_LIMIT) + ix86_avoid_jump_mispredicts (); +- +- ix86_align_loops (); + #endif + } + } +diff --git a/gcc/testsuite/gcc.target/i386/pr116174.c b/gcc/testsuite/gcc.target/i386/pr116174.c +new file mode 100644 +index 000000000000..8877d0b51af1 +--- /dev/null ++++ b/gcc/testsuite/gcc.target/i386/pr116174.c +@@ -0,0 +1,12 @@ ++/* { dg-do compile { target *-*-linux* } } */ ++/* { dg-options "-O2 -fcf-protection=branch" } */ ++ ++char * ++foo (char *dest, const char *src) ++{ ++ while ((*dest++ = *src++) != '\0') ++ /* nothing */; ++ return --dest; ++} ++ ++/* { dg-final { scan-assembler "\t\.cfi_startproc\n\tendbr(32|64)\n" } } */ +-- +2.43.5
The following warning appears when building linux-yocto with the default configuration on x86-64 with gcc-14.2: AR built-in.a AR vmlinux.a LD vmlinux.o vmlinux.o: warning: objtool: .export_symbol+0x332a0: data relocation to !ENDBR: stpcpy+0x0 This change set removes the warning by backporting the upstream GCC fix, which moves ix86_align_loops into a separate pass that is inserted after pass_insert_endbr_and_patchable_area. PR target/116174 [https://gcc.gnu.org/bugzilla/show_bug.cgi?id=116174] Signed-off-by: Bin Lan <bin.lan.cn@windriver.com> --- meta/recipes-devtools/gcc/gcc-14.2.inc | 1 + ...ch-to-fix-data-relocation-to-ENDBR-s.patch | 447 ++++++++++++++++++ 2 files changed, 448 insertions(+) create mode 100644 meta/recipes-devtools/gcc/gcc/0027-gcc-backport-patch-to-fix-data-relocation-to-ENDBR-s.patch