gcc: backport patch to fix data relocation to !ENDBR: stpcpy

Message ID 20241217104729.4115134-1-bin.lan.cn@windriver.com
State New
Series gcc: backport patch to fix data relocation to !ENDBR: stpcpy

Commit Message

Bin Lan Dec. 17, 2024, 10:47 a.m. UTC
There is the following warning when building linux-yocto with the
default configuration on x86-64 with gcc-14.2:
  AR      built-in.a
  AR      vmlinux.a
  LD      vmlinux.o
  vmlinux.o: warning: objtool: .export_symbol+0x332a0: data relocation to !ENDBR: stpcpy+0x0

This change set removes the warning.

PR target/116174 [https://gcc.gnu.org/bugzilla/show_bug.cgi?id=116174]

Signed-off-by: Bin Lan <bin.lan.cn@windriver.com>
---
 meta/recipes-devtools/gcc/gcc-14.2.inc        |   1 +
 ...ch-to-fix-data-relocation-to-ENDBR-s.patch | 447 ++++++++++++++++++
 2 files changed, 448 insertions(+)
 create mode 100644 meta/recipes-devtools/gcc/gcc/0027-gcc-backport-patch-to-fix-data-relocation-to-ENDBR-s.patch
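
For reference, the failure can be examined without a full kernel build. The
backport adds a reduced test (gcc.target/i386/pr116174.c) whose function is
essentially stpcpy; the sketch below adapts that test, with comments on how
one might inspect the generated assembly by hand. The compile command and the
broken-output shape shown in the comment are assumptions based on the PR
discussion, not part of this patch.

/* Adapted from the gcc.target/i386/pr116174.c test included in this
   backport; the function is essentially stpcpy.  To check a compiler
   by hand (hypothetical command, not from the patch):

       gcc -O2 -fcf-protection=branch -S pr116174.c

   Without the fix, the tight-loop alignment ran before the ENDBR
   insertion pass, and the output looked roughly like:

       foo:
               .cfi_startproc
               .p2align 6
               endbr64

   so foo+0x0 is alignment padding rather than an ENDBR instruction --
   the condition objtool reports.  With the fix, endbr64 immediately
   follows .cfi_startproc, which is what the test's dg-final
   scan-assembler pattern asserts.  */

char *
foo (char *dest, const char *src)
{
  while ((*dest++ = *src++) != '\0')
    /* nothing */;
  return --dest;  /* stpcpy semantics: pointer to the terminating NUL.  */
}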

Comments

Khem Raj Dec. 17, 2024, 5:20 p.m. UTC | #1
On Tue, Dec 17, 2024 at 2:47 AM Bin Lan via lists.openembedded.org
<bin.lan.cn=windriver.com@lists.openembedded.org> wrote:
>
> [...]
>
> diff --git a/meta/recipes-devtools/gcc/gcc-14.2.inc b/meta/recipes-devtools/gcc/gcc-14.2.inc
> index 4f505bef68..a25bc019e5 100644
> --- a/meta/recipes-devtools/gcc/gcc-14.2.inc
> +++ b/meta/recipes-devtools/gcc/gcc-14.2.inc
> @@ -69,6 +69,7 @@ SRC_URI = "${BASEURI} \
>             file://0024-Avoid-hardcoded-build-paths-into-ppc-libgcc.patch \
>             file://0025-gcc-testsuite-tweaks-for-mips-OE.patch \
>            file://0026-gcc-Fix-c-tweak-for-Wrange-loop-construct.patch \
> +           file://0027-gcc-backport-patch-to-fix-data-relocation-to-ENDBR-s.patch \

tiny nit.
is this tab vs spaces? can you fix this.

> [...]

Richard Purdie Dec. 17, 2024, 9:14 p.m. UTC | #2
On Tue, 2024-12-17 at 09:20 -0800, Khem Raj via lists.openembedded.org wrote:
> On Tue, Dec 17, 2024 at 2:47 AM Bin Lan via lists.openembedded.org
> <bin.lan.cn=windriver.com@lists.openembedded.org> wrote:
> > [...]
> > diff --git a/meta/recipes-devtools/gcc/gcc-14.2.inc b/meta/recipes-devtools/gcc/gcc-14.2.inc
> > index 4f505bef68..a25bc019e5 100644
> > --- a/meta/recipes-devtools/gcc/gcc-14.2.inc
> > +++ b/meta/recipes-devtools/gcc/gcc-14.2.inc
> > @@ -69,6 +69,7 @@ SRC_URI = "${BASEURI} \
> >             file://0024-Avoid-hardcoded-build-paths-into-ppc-libgcc.patch \
> >             file://0025-gcc-testsuite-tweaks-for-mips-OE.patch \
> >            file://0026-gcc-Fix-c-tweak-for-Wrange-loop-construct.patch \
> > +           file://0027-gcc-backport-patch-to-fix-data-relocation-to-ENDBR-s.patch \
> 
> tiny nit.
> is this tab vs spaces? can you fix this.

Looks to be in the original file rather than an addition by this patch :/.

Cheers,

Richard
Khem Raj Dec. 17, 2024, 9:19 p.m. UTC | #3
On Tue, Dec 17, 2024 at 1:14 PM Richard Purdie
<richard.purdie@linuxfoundation.org> wrote:
>
> On Tue, 2024-12-17 at 09:20 -0800, Khem Raj via lists.openembedded.org wrote:
> > On Tue, Dec 17, 2024 at 2:47 AM Bin Lan via lists.openembedded.org
> > <bin.lan.cn=windriver.com@lists.openembedded.org> wrote:
> > > [...]
> > > diff --git a/meta/recipes-devtools/gcc/gcc-14.2.inc b/meta/recipes-devtools/gcc/gcc-14.2.inc
> > > index 4f505bef68..a25bc019e5 100644
> > > --- a/meta/recipes-devtools/gcc/gcc-14.2.inc
> > > +++ b/meta/recipes-devtools/gcc/gcc-14.2.inc
> > > @@ -69,6 +69,7 @@ SRC_URI = "${BASEURI} \
> > >             file://0024-Avoid-hardcoded-build-paths-into-ppc-libgcc.patch \
> > >             file://0025-gcc-testsuite-tweaks-for-mips-OE.patch \
> > >            file://0026-gcc-Fix-c-tweak-for-Wrange-loop-construct.patch \
> > > +           file://0027-gcc-backport-patch-to-fix-data-relocation-to-ENDBR-s.patch \
> >
> > tiny nit.
> > is this tab vs spaces? can you fix this.
>
> Looks to be in the original file rather than an addition by this patch :/.

sadly yes.

>
> Cheers,
>
> Richard
Bin Lan Dec. 18, 2024, 1:31 a.m. UTC | #4
On 12/18/24 01:20, Khem Raj wrote:
>
> On Tue, Dec 17, 2024 at 2:47 AM Bin Lan via lists.openembedded.org
> <bin.lan.cn=windriver.com@lists.openembedded.org> wrote:
>> [...]
>> diff --git a/meta/recipes-devtools/gcc/gcc-14.2.inc b/meta/recipes-devtools/gcc/gcc-14.2.inc
>> index 4f505bef68..a25bc019e5 100644
>> --- a/meta/recipes-devtools/gcc/gcc-14.2.inc
>> +++ b/meta/recipes-devtools/gcc/gcc-14.2.inc
>> @@ -69,6 +69,7 @@ SRC_URI = "${BASEURI} \
>>              file://0024-Avoid-hardcoded-build-paths-into-ppc-libgcc.patch \
>>              file://0025-gcc-testsuite-tweaks-for-mips-OE.patch \
>>             file://0026-gcc-Fix-c-tweak-for-Wrange-loop-construct.patch \
>> +           file://0027-gcc-backport-patch-to-fix-data-relocation-to-ENDBR-s.patch \
> tiny nit.
> is this tab vs spaces? can you fix this.

I have fixed it and the v2 patch is sent out.

//Bin Lan

Patch

diff --git a/meta/recipes-devtools/gcc/gcc-14.2.inc b/meta/recipes-devtools/gcc/gcc-14.2.inc
index 4f505bef68..a25bc019e5 100644
--- a/meta/recipes-devtools/gcc/gcc-14.2.inc
+++ b/meta/recipes-devtools/gcc/gcc-14.2.inc
@@ -69,6 +69,7 @@  SRC_URI = "${BASEURI} \
            file://0024-Avoid-hardcoded-build-paths-into-ppc-libgcc.patch \
            file://0025-gcc-testsuite-tweaks-for-mips-OE.patch \
 	   file://0026-gcc-Fix-c-tweak-for-Wrange-loop-construct.patch \
+           file://0027-gcc-backport-patch-to-fix-data-relocation-to-ENDBR-s.patch \
            file://gcc.git-ab884fffe3fc82a710bea66ad651720d71c938b8.patch \
 "
 
diff --git a/meta/recipes-devtools/gcc/gcc/0027-gcc-backport-patch-to-fix-data-relocation-to-ENDBR-s.patch b/meta/recipes-devtools/gcc/gcc/0027-gcc-backport-patch-to-fix-data-relocation-to-ENDBR-s.patch
new file mode 100644
index 0000000000..5bede60816
--- /dev/null
+++ b/meta/recipes-devtools/gcc/gcc/0027-gcc-backport-patch-to-fix-data-relocation-to-ENDBR-s.patch
@@ -0,0 +1,447 @@ 
+From 4e7735a8d87559bbddfe3a985786996e22241f8d Mon Sep 17 00:00:00 2001
+From: liuhongt <hongtao.liu@intel.com>
+Date: Mon, 12 Aug 2024 14:35:31 +0800
+Subject: [PATCH] Move ix86_align_loops into a separate pass and insert the
+ pass after pass_endbr_and_patchable_area.
+
+gcc/ChangeLog:
+
+	PR target/116174
+	* config/i386/i386.cc (ix86_align_loops): Move this to ..
+	* config/i386/i386-features.cc (ix86_align_loops): .. here.
+	(class pass_align_tight_loops): New class.
+	(make_pass_align_tight_loops): New function.
+	* config/i386/i386-passes.def: Insert pass_align_tight_loops
+	after pass_insert_endbr_and_patchable_area.
+	* config/i386/i386-protos.h (make_pass_align_tight_loops): New
+	declare.
+
+gcc/testsuite/ChangeLog:
+
+	* gcc.target/i386/pr116174.c: New test.
+
+(cherry picked from commit c3c83d22d212a35cb1bfb8727477819463f0dcd8)
+
+Upstream-Status: Backport [https://gcc.gnu.org/git/?p=gcc.git;a=patch;h=4e7735a8d87559bbddfe3a985786996e22241f8d]
+
+Signed-off-by: Bin Lan <bin.lan.cn@windriver.com>
+---
+ gcc/config/i386/i386-features.cc         | 191 +++++++++++++++++++++++
+ gcc/config/i386/i386-passes.def          |   3 +
+ gcc/config/i386/i386-protos.h            |   1 +
+ gcc/config/i386/i386.cc                  | 146 -----------------
+ gcc/testsuite/gcc.target/i386/pr116174.c |  12 ++
+ 5 files changed, 207 insertions(+), 146 deletions(-)
+ create mode 100644 gcc/testsuite/gcc.target/i386/pr116174.c
+
+diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
+index e3e004d55267..7de19d423637 100644
+--- a/gcc/config/i386/i386-features.cc
++++ b/gcc/config/i386/i386-features.cc
+@@ -3253,6 +3253,197 @@ make_pass_remove_partial_avx_dependency (gcc::context *ctxt)
+   return new pass_remove_partial_avx_dependency (ctxt);
+ }
+ 
++/* When a hot loop can be fit into one cacheline,
++   force align the loop without considering the max skip.  */
++static void
++ix86_align_loops ()
++{
++  basic_block bb;
++
++  /* Don't do this when we don't know cache line size.  */
++  if (ix86_cost->prefetch_block == 0)
++    return;
++
++  loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
++  profile_count count_threshold = cfun->cfg->count_max / param_align_threshold;
++  FOR_EACH_BB_FN (bb, cfun)
++    {
++      rtx_insn *label = BB_HEAD (bb);
++      bool has_fallthru = 0;
++      edge e;
++      edge_iterator ei;
++
++      if (!LABEL_P (label))
++	continue;
++
++      profile_count fallthru_count = profile_count::zero ();
++      profile_count branch_count = profile_count::zero ();
++
++      FOR_EACH_EDGE (e, ei, bb->preds)
++	{
++	  if (e->flags & EDGE_FALLTHRU)
++	    has_fallthru = 1, fallthru_count += e->count ();
++	  else
++	    branch_count += e->count ();
++	}
++
++      if (!fallthru_count.initialized_p () || !branch_count.initialized_p ())
++	continue;
++
++      if (bb->loop_father
++	  && bb->loop_father->latch != EXIT_BLOCK_PTR_FOR_FN (cfun)
++	  && (has_fallthru
++	      ? (!(single_succ_p (bb)
++		   && single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (cfun))
++		 && optimize_bb_for_speed_p (bb)
++		 && branch_count + fallthru_count > count_threshold
++		 && (branch_count > fallthru_count * param_align_loop_iterations))
++	      /* In case there'no fallthru for the loop.
++		 Nops inserted won't be executed.  */
++	      : (branch_count > count_threshold
++		 || (bb->count > bb->prev_bb->count * 10
++		     && (bb->prev_bb->count
++			 <= ENTRY_BLOCK_PTR_FOR_FN (cfun)->count / 2)))))
++	{
++	  rtx_insn* insn, *end_insn;
++	  HOST_WIDE_INT size = 0;
++	  bool padding_p = true;
++	  basic_block tbb = bb;
++	  unsigned cond_branch_num = 0;
++	  bool detect_tight_loop_p = false;
++
++	  for (unsigned int i = 0; i != bb->loop_father->num_nodes;
++	       i++, tbb = tbb->next_bb)
++	    {
++	      /* Only handle continuous cfg layout. */
++	      if (bb->loop_father != tbb->loop_father)
++		{
++		  padding_p = false;
++		  break;
++		}
++
++	      FOR_BB_INSNS (tbb, insn)
++		{
++		  if (!NONDEBUG_INSN_P (insn))
++		    continue;
++		  size += ix86_min_insn_size (insn);
++
++		  /* We don't know size of inline asm.
++		     Don't align loop for call.  */
++		  if (asm_noperands (PATTERN (insn)) >= 0
++		      || CALL_P (insn))
++		    {
++		      size = -1;
++		      break;
++		    }
++		}
++
++	      if (size == -1 || size > ix86_cost->prefetch_block)
++		{
++		  padding_p = false;
++		  break;
++		}
++
++	      FOR_EACH_EDGE (e, ei, tbb->succs)
++		{
++		  /* It could be part of the loop.  */
++		  if (e->dest == bb)
++		    {
++		      detect_tight_loop_p = true;
++		      break;
++		    }
++		}
++
++	      if (detect_tight_loop_p)
++		break;
++
++	      end_insn = BB_END (tbb);
++	      if (JUMP_P (end_insn))
++		{
++		  /* For decoded icache:
++		     1. Up to two branches are allowed per Way.
++		     2. A non-conditional branch is the last micro-op in a Way.
++		  */
++		  if (onlyjump_p (end_insn)
++		      && (any_uncondjump_p (end_insn)
++			  || single_succ_p (tbb)))
++		    {
++		      padding_p = false;
++		      break;
++		    }
++		  else if (++cond_branch_num >= 2)
++		    {
++		      padding_p = false;
++		      break;
++		    }
++		}
++
++	    }
++
++	  if (padding_p && detect_tight_loop_p)
++	    {
++	      emit_insn_before (gen_max_skip_align (GEN_INT (ceil_log2 (size)),
++						    GEN_INT (0)), label);
++	      /* End of function.  */
++	      if (!tbb || tbb == EXIT_BLOCK_PTR_FOR_FN (cfun))
++		break;
++	      /* Skip bb which already fits into one cacheline.  */
++	      bb = tbb;
++	    }
++	}
++    }
++
++  loop_optimizer_finalize ();
++  free_dominance_info (CDI_DOMINATORS);
++}
++
++namespace {
++
++const pass_data pass_data_align_tight_loops =
++{
++  RTL_PASS, /* type */
++  "align_tight_loops", /* name */
++  OPTGROUP_NONE, /* optinfo_flags */
++  TV_MACH_DEP, /* tv_id */
++  0, /* properties_required */
++  0, /* properties_provided */
++  0, /* properties_destroyed */
++  0, /* todo_flags_start */
++  0, /* todo_flags_finish */
++};
++
++class pass_align_tight_loops : public rtl_opt_pass
++{
++public:
++  pass_align_tight_loops (gcc::context *ctxt)
++    : rtl_opt_pass (pass_data_align_tight_loops, ctxt)
++  {}
++
++  /* opt_pass methods: */
++  bool gate (function *) final override
++    {
++      return optimize && optimize_function_for_speed_p (cfun);
++    }
++
++  unsigned int execute (function *) final override
++    {
++      timevar_push (TV_MACH_DEP);
++#ifdef ASM_OUTPUT_MAX_SKIP_ALIGN
++      ix86_align_loops ();
++#endif
++      timevar_pop (TV_MACH_DEP);
++      return 0;
++    }
++}; // class pass_align_tight_loops
++
++} // anon namespace
++
++rtl_opt_pass *
++make_pass_align_tight_loops (gcc::context *ctxt)
++{
++  return new pass_align_tight_loops (ctxt);
++}
++
+ /* This compares the priority of target features in function DECL1
+    and DECL2.  It returns positive value if DECL1 is higher priority,
+    negative value if DECL2 is higher priority and 0 if they are the
+diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def
+index 7d96766f7b96..e500f15c9971 100644
+--- a/gcc/config/i386/i386-passes.def
++++ b/gcc/config/i386/i386-passes.def
+@@ -31,5 +31,8 @@ along with GCC; see the file COPYING3.  If not see
+   INSERT_PASS_BEFORE (pass_cse2, 1, pass_stv, true /* timode_p */);
+ 
+   INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_insert_endbr_and_patchable_area);
++  /* pass_align_tight_loops must be after pass_insert_endbr_and_patchable_area.
++     PR116174.  */
++  INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_align_tight_loops);
+ 
+   INSERT_PASS_AFTER (pass_combine, 1, pass_remove_partial_avx_dependency);
+diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
+index 46214a63974d..36c7b1aed42b 100644
+--- a/gcc/config/i386/i386-protos.h
++++ b/gcc/config/i386/i386-protos.h
+@@ -419,6 +419,7 @@ extern rtl_opt_pass *make_pass_insert_endbr_and_patchable_area
+   (gcc::context *);
+ extern rtl_opt_pass *make_pass_remove_partial_avx_dependency
+   (gcc::context *);
++extern rtl_opt_pass *make_pass_align_tight_loops (gcc::context *);
+ 
+ extern bool ix86_has_no_direct_extern_access;
+ 
+diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
+index 6f89891d3cb5..288c69467d62 100644
+--- a/gcc/config/i386/i386.cc
++++ b/gcc/config/i386/i386.cc
+@@ -23444,150 +23444,6 @@ ix86_split_stlf_stall_load ()
+     }
+ }
+ 
+-/* When a hot loop can be fit into one cacheline,
+-   force align the loop without considering the max skip.  */
+-static void
+-ix86_align_loops ()
+-{
+-  basic_block bb;
+-
+-  /* Don't do this when we don't know cache line size.  */
+-  if (ix86_cost->prefetch_block == 0)
+-    return;
+-
+-  loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
+-  profile_count count_threshold = cfun->cfg->count_max / param_align_threshold;
+-  FOR_EACH_BB_FN (bb, cfun)
+-    {
+-      rtx_insn *label = BB_HEAD (bb);
+-      bool has_fallthru = 0;
+-      edge e;
+-      edge_iterator ei;
+-
+-      if (!LABEL_P (label))
+-	continue;
+-
+-      profile_count fallthru_count = profile_count::zero ();
+-      profile_count branch_count = profile_count::zero ();
+-
+-      FOR_EACH_EDGE (e, ei, bb->preds)
+-	{
+-	  if (e->flags & EDGE_FALLTHRU)
+-	    has_fallthru = 1, fallthru_count += e->count ();
+-	  else
+-	    branch_count += e->count ();
+-	}
+-
+-      if (!fallthru_count.initialized_p () || !branch_count.initialized_p ())
+-	continue;
+-
+-      if (bb->loop_father
+-	  && bb->loop_father->latch != EXIT_BLOCK_PTR_FOR_FN (cfun)
+-	  && (has_fallthru
+-	      ? (!(single_succ_p (bb)
+-		   && single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (cfun))
+-		 && optimize_bb_for_speed_p (bb)
+-		 && branch_count + fallthru_count > count_threshold
+-		 && (branch_count > fallthru_count * param_align_loop_iterations))
+-	      /* In case there'no fallthru for the loop.
+-		 Nops inserted won't be executed.  */
+-	      : (branch_count > count_threshold
+-		 || (bb->count > bb->prev_bb->count * 10
+-		     && (bb->prev_bb->count
+-			 <= ENTRY_BLOCK_PTR_FOR_FN (cfun)->count / 2)))))
+-	{
+-	  rtx_insn* insn, *end_insn;
+-	  HOST_WIDE_INT size = 0;
+-	  bool padding_p = true;
+-	  basic_block tbb = bb;
+-	  unsigned cond_branch_num = 0;
+-	  bool detect_tight_loop_p = false;
+-
+-	  for (unsigned int i = 0; i != bb->loop_father->num_nodes;
+-	       i++, tbb = tbb->next_bb)
+-	    {
+-	      /* Only handle continuous cfg layout. */
+-	      if (bb->loop_father != tbb->loop_father)
+-		{
+-		  padding_p = false;
+-		  break;
+-		}
+-
+-	      FOR_BB_INSNS (tbb, insn)
+-		{
+-		  if (!NONDEBUG_INSN_P (insn))
+-		    continue;
+-		  size += ix86_min_insn_size (insn);
+-
+-		  /* We don't know size of inline asm.
+-		     Don't align loop for call.  */
+-		  if (asm_noperands (PATTERN (insn)) >= 0
+-		      || CALL_P (insn))
+-		    {
+-		      size = -1;
+-		      break;
+-		    }
+-		}
+-
+-	      if (size == -1 || size > ix86_cost->prefetch_block)
+-		{
+-		  padding_p = false;
+-		  break;
+-		}
+-
+-	      FOR_EACH_EDGE (e, ei, tbb->succs)
+-		{
+-		  /* It could be part of the loop.  */
+-		  if (e->dest == bb)
+-		    {
+-		      detect_tight_loop_p = true;
+-		      break;
+-		    }
+-		}
+-
+-	      if (detect_tight_loop_p)
+-		break;
+-
+-	      end_insn = BB_END (tbb);
+-	      if (JUMP_P (end_insn))
+-		{
+-		  /* For decoded icache:
+-		     1. Up to two branches are allowed per Way.
+-		     2. A non-conditional branch is the last micro-op in a Way.
+-		  */
+-		  if (onlyjump_p (end_insn)
+-		      && (any_uncondjump_p (end_insn)
+-			  || single_succ_p (tbb)))
+-		    {
+-		      padding_p = false;
+-		      break;
+-		    }
+-		  else if (++cond_branch_num >= 2)
+-		    {
+-		      padding_p = false;
+-		      break;
+-		    }
+-		}
+-
+-	    }
+-
+-	  if (padding_p && detect_tight_loop_p)
+-	    {
+-	      emit_insn_before (gen_max_skip_align (GEN_INT (ceil_log2 (size)),
+-						    GEN_INT (0)), label);
+-	      /* End of function.  */
+-	      if (!tbb || tbb == EXIT_BLOCK_PTR_FOR_FN (cfun))
+-		break;
+-	      /* Skip bb which already fits into one cacheline.  */
+-	      bb = tbb;
+-	    }
+-	}
+-    }
+-
+-  loop_optimizer_finalize ();
+-  free_dominance_info (CDI_DOMINATORS);
+-}
+-
+ /* Implement machine specific optimizations.  We implement padding of returns
+    for K8 CPUs and pass to avoid 4 jumps in the single 16 byte window.  */
+ static void
+@@ -23611,8 +23467,6 @@ ix86_reorg (void)
+ #ifdef ASM_OUTPUT_MAX_SKIP_ALIGN
+       if (TARGET_FOUR_JUMP_LIMIT)
+ 	ix86_avoid_jump_mispredicts ();
+-
+-      ix86_align_loops ();
+ #endif
+     }
+ }
+diff --git a/gcc/testsuite/gcc.target/i386/pr116174.c b/gcc/testsuite/gcc.target/i386/pr116174.c
+new file mode 100644
+index 000000000000..8877d0b51af1
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/i386/pr116174.c
+@@ -0,0 +1,12 @@
++/* { dg-do compile { target *-*-linux* } } */
++/* { dg-options "-O2 -fcf-protection=branch" } */
++
++char *
++foo (char *dest, const char *src)
++{
++  while ((*dest++ = *src++) != '\0')
++    /* nothing */;
++  return --dest;
++}
++
++/* { dg-final { scan-assembler "\t\.cfi_startproc\n\tendbr(32|64)\n" } } */
+-- 
+2.43.5
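
For context on the kernel side of the warning: with IBT enabled, objtool
requires that any code address stored in a data section point at an ENDBR
instruction, and the kernel's EXPORT_SYMBOL machinery records each exported
symbol's address in the .export_symbol section, which is how stpcpy came to
be checked. Below is a minimal user-space sketch of that pattern; the
section placement and entry layout are illustrative assumptions, not the
kernel's actual macro expansion.

/* Illustrative sketch: place a function pointer in a named data
   section, mirroring how EXPORT_SYMBOL leaves a data relocation to the
   exported function in .export_symbol.  objtool follows such
   relocations and warns ("data relocation to !ENDBR") when the target
   does not begin with an ENDBR -- which the mis-ordered loop alignment
   caused for stpcpy.  */

char *
my_stpcpy (char *dest, const char *src)
{
  while ((*dest++ = *src++) != '\0')
    ;
  return --dest;
}

/* Hypothetical export entry: a data relocation to my_stpcpy.  */
static char *(*const export_entry) (char *, const char *)
  __attribute__ ((section (".export_symbol"), used)) = my_stpcpy;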