summaryrefslogtreecommitdiff
path: root/resources/coreboot/haswell/patches/0021-haswell-NRI-Add-RcvEn-training.patch
diff options
context:
space:
mode:
authorLeah Rowe <leah@libreboot.org>2023-03-18 00:36:27 +0000
committerLeah Rowe <leah@libreboot.org>2023-03-18 00:55:10 +0000
commit548872ce8e84fe10d52417acab9b3cf886821386 (patch)
treeb549ad51c9c693a323f0f8edc9a11eab92220013 /resources/coreboot/haswell/patches/0021-haswell-NRI-Add-RcvEn-training.patch
parenta942bd6590dd450140fc0af8549ca470a065adf5 (diff)
haswell boards: use libre mrc.bin replacement
courtesy of Angel Pons from the coreboot project this uses the following patch set from gerrit, as yet unmerged (in coreboot master) on this date: https://review.coreboot.org/c/coreboot/+/64198/5 logic for downloading mrc blobs has been deleted from lbmk, as this is now completely obsolete (for haswell boards) if other platforms are added later that need mrc.bin, then logic will be re-added again for that
Diffstat (limited to 'resources/coreboot/haswell/patches/0021-haswell-NRI-Add-RcvEn-training.patch')
-rw-r--r--resources/coreboot/haswell/patches/0021-haswell-NRI-Add-RcvEn-training.patch708
1 files changed, 708 insertions, 0 deletions
diff --git a/resources/coreboot/haswell/patches/0021-haswell-NRI-Add-RcvEn-training.patch b/resources/coreboot/haswell/patches/0021-haswell-NRI-Add-RcvEn-training.patch
new file mode 100644
index 00000000..9139a67e
--- /dev/null
+++ b/resources/coreboot/haswell/patches/0021-haswell-NRI-Add-RcvEn-training.patch
@@ -0,0 +1,708 @@
+From ac8843553af34855d0331554c03280e66c4ea582 Mon Sep 17 00:00:00 2001
+From: Angel Pons <th3fanbus@gmail.com>
+Date: Sun, 8 May 2022 00:05:41 +0200
+Subject: [PATCH 21/26] haswell NRI: Add RcvEn training
+
+Implement the RcvEn (Receive Enable) calibration procedure.
+
+Change-Id: Ifbfa520f3e0486c56d0988ce67af2ddb9cf29888
+Signed-off-by: Angel Pons <th3fanbus@gmail.com>
+---
+ .../intel/haswell/native_raminit/Makefile.inc | 1 +
+ .../haswell/native_raminit/raminit_main.c | 1 +
+ .../haswell/native_raminit/raminit_native.h | 14 +
+ .../haswell/native_raminit/reg_structs.h | 13 +
+ .../native_raminit/train_receive_enable.c | 561 ++++++++++++++++++
+ .../intel/haswell/registers/mchbar.h | 3 +
+ 6 files changed, 593 insertions(+)
+ create mode 100644 src/northbridge/intel/haswell/native_raminit/train_receive_enable.c
+
+diff --git a/src/northbridge/intel/haswell/native_raminit/Makefile.inc b/src/northbridge/intel/haswell/native_raminit/Makefile.inc
+index ebe9e9b762..e2fbfb4211 100644
+--- a/src/northbridge/intel/haswell/native_raminit/Makefile.inc
++++ b/src/northbridge/intel/haswell/native_raminit/Makefile.inc
+@@ -16,3 +16,4 @@ romstage-y += setup_wdb.c
+ romstage-y += spd_bitmunching.c
+ romstage-y += testing_io.c
+ romstage-y += timings_refresh.c
++romstage-y += train_receive_enable.c
+diff --git a/src/northbridge/intel/haswell/native_raminit/raminit_main.c b/src/northbridge/intel/haswell/native_raminit/raminit_main.c
+index 5e4674957d..7d444659c3 100644
+--- a/src/northbridge/intel/haswell/native_raminit/raminit_main.c
++++ b/src/northbridge/intel/haswell/native_raminit/raminit_main.c
+@@ -60,6 +60,7 @@ static const struct task_entry cold_boot[] = {
+ { configure_memory_map, true, "MEMMAP", },
+ { do_jedec_init, true, "JEDECINIT", },
+ { pre_training, true, "PRETRAIN", },
++ { train_receive_enable, true, "RCVET", },
+ };
+
+ /* Return a generic stepping value to make stepping checks simpler */
+diff --git a/src/northbridge/intel/haswell/native_raminit/raminit_native.h b/src/northbridge/intel/haswell/native_raminit/raminit_native.h
+index 7c1a786780..a36ebfacd1 100644
+--- a/src/northbridge/intel/haswell/native_raminit/raminit_native.h
++++ b/src/northbridge/intel/haswell/native_raminit/raminit_native.h
+@@ -42,6 +42,9 @@
+ #define NUM_WDB_CL_MUX_SEEDS 3
+ #define NUM_CADB_MUX_SEEDS 3
+
++/* Specified in PI ticks. 64 PI ticks == 1 qclk */
++#define tDQSCK_DRIFT 64
++
+ /* ZQ calibration types */
+ enum {
+ ZQ_INIT, /* DDR3: ZQCL with tZQinit, LPDDR3: ZQ Init with tZQinit */
+@@ -188,6 +191,7 @@ enum raminit_status {
+ RAMINIT_STATUS_MPLL_INIT_FAILURE,
+ RAMINIT_STATUS_POLL_TIMEOUT,
+ RAMINIT_STATUS_REUT_ERROR,
++ RAMINIT_STATUS_RCVEN_FAILURE,
+ RAMINIT_STATUS_UNSPECIFIED_ERROR, /** TODO: Deprecated in favor of specific values **/
+ };
+
+@@ -270,6 +274,10 @@ struct sysinfo {
+
+ union ddr_data_vref_adjust_reg dimm_vref;
+
++ uint8_t io_latency[NUM_CHANNELS][NUM_SLOTRANKS];
++ uint8_t rt_latency[NUM_CHANNELS][NUM_SLOTRANKS];
++ uint32_t rt_io_comp[NUM_CHANNELS];
++
+ uint32_t data_offset_train[NUM_CHANNELS][NUM_LANES];
+ uint32_t data_offset_comp[NUM_CHANNELS][NUM_LANES];
+
+@@ -344,6 +352,11 @@ static inline void clear_data_offset_train_all(struct sysinfo *ctrl)
+ memset(ctrl->data_offset_train, 0, sizeof(ctrl->data_offset_train));
+ }
+
++static inline uint32_t get_data_train_feedback(const uint8_t channel, const uint8_t byte)
++{
++ return mchbar_read32(DDR_DATA_TRAIN_FEEDBACK(channel, byte));
++}
++
+ /* Number of ticks to wait in units of 69.841279 ns (citation needed) */
+ static inline void tick_delay(const uint32_t delay)
+ {
+@@ -401,6 +414,7 @@ enum raminit_status convert_timings(struct sysinfo *ctrl);
+ enum raminit_status configure_mc(struct sysinfo *ctrl);
+ enum raminit_status configure_memory_map(struct sysinfo *ctrl);
+ enum raminit_status do_jedec_init(struct sysinfo *ctrl);
++enum raminit_status train_receive_enable(struct sysinfo *ctrl);
+
+ void configure_timings(struct sysinfo *ctrl);
+ void configure_refresh(struct sysinfo *ctrl);
+diff --git a/src/northbridge/intel/haswell/native_raminit/reg_structs.h b/src/northbridge/intel/haswell/native_raminit/reg_structs.h
+index b943259b91..b099f4bb82 100644
+--- a/src/northbridge/intel/haswell/native_raminit/reg_structs.h
++++ b/src/northbridge/intel/haswell/native_raminit/reg_structs.h
+@@ -297,6 +297,19 @@ union ddr_scram_misc_control_reg {
+ uint32_t raw;
+ };
+
++union sc_io_latency_reg {
++ struct __packed {
++ uint32_t iolat_rank0 : 4; // Bits 3:0
++ uint32_t iolat_rank1 : 4; // Bits 7:4
++ uint32_t iolat_rank2 : 4; // Bits 11:8
++ uint32_t iolat_rank3 : 4; // Bits 15:12
++ uint32_t rt_iocomp : 6; // Bits 21:16
++ uint32_t : 9; // Bits 30:22
++ uint32_t dis_rt_clk_gate : 1; // Bits 31:31
++ };
++ uint32_t raw;
++};
++
+ union mcscheds_cbit_reg {
+ struct __packed {
+ uint32_t dis_opp_cas : 1; // Bits 0:0
+diff --git a/src/northbridge/intel/haswell/native_raminit/train_receive_enable.c b/src/northbridge/intel/haswell/native_raminit/train_receive_enable.c
+new file mode 100644
+index 0000000000..576c6bc21e
+--- /dev/null
++++ b/src/northbridge/intel/haswell/native_raminit/train_receive_enable.c
+@@ -0,0 +1,561 @@
++/* SPDX-License-Identifier: GPL-2.0-or-later */
++
++#include <console/console.h>
++#include <northbridge/intel/haswell/haswell.h>
++#include <types.h>
++
++#include "raminit_native.h"
++#include "ranges.h"
++
++#define RCVEN_PLOT RAM_DEBUG
++
++static enum raminit_status change_rcven_timing(struct sysinfo *ctrl, const uint8_t channel)
++{
++ int16_t max_rcven = -4096;
++ int16_t min_rcven = 4096;
++ int16_t max_rcven_rank[NUM_SLOTRANKS];
++ int16_t min_rcven_rank[NUM_SLOTRANKS];
++ for (uint8_t rank = 0; rank < NUM_SLOTRANKS; rank++) {
++ max_rcven_rank[rank] = max_rcven;
++ min_rcven_rank[rank] = min_rcven;
++ }
++ for (uint8_t rank = 0; rank < NUM_SLOTRANKS; rank++) {
++ if (!rank_in_ch(ctrl, rank, channel))
++ continue;
++
++ for (uint8_t byte = 0; byte < ctrl->lanes; byte++) {
++ int16_t new_rcven = ctrl->rcven[channel][rank][byte];
++ new_rcven -= ctrl->io_latency[channel][rank] * 64;
++ if (max_rcven_rank[rank] < new_rcven)
++ max_rcven_rank[rank] = new_rcven;
++
++ if (min_rcven_rank[rank] > new_rcven)
++ min_rcven_rank[rank] = new_rcven;
++ }
++ if (max_rcven < max_rcven_rank[rank])
++ max_rcven = max_rcven_rank[rank];
++
++ if (min_rcven > min_rcven_rank[rank])
++ min_rcven = min_rcven_rank[rank];
++ }
++
++ /*
++ * Determine how far we are from the ideal center point for RcvEn timing.
++ * (PiIdeal - AveRcvEn) / 64 is the ideal number of cycles we should have
++ * for IO latency. command training will reduce this by 64, so plan for
++ * that now in the ideal value. Round to closest integer.
++ */
++ const int16_t rre_pi_ideal = 256 + 64;
++ const int16_t pi_reserve = 64;
++ const int16_t rcven_center = (max_rcven + min_rcven) / 2;
++ const int8_t iolat_target = DIV_ROUND_CLOSEST(rre_pi_ideal - rcven_center, 64);
++
++ int8_t io_g_offset = 0;
++ int8_t io_lat[NUM_SLOTRANKS] = { 0 };
++ for (uint8_t rank = 0; rank < NUM_SLOTRANKS; rank++) {
++ if (!rank_in_ch(ctrl, rank, channel))
++ continue;
++
++ io_lat[rank] = iolat_target;
++
++ /* Check for RcvEn underflow/overflow */
++ const int16_t rcven_lower = 64 * io_lat[rank] + min_rcven_rank[rank];
++ if (rcven_lower < pi_reserve)
++ io_lat[rank] += DIV_ROUND_UP(pi_reserve - rcven_lower, 64);
++
++ const int16_t rcven_upper = 64 * io_lat[rank] + max_rcven_rank[rank];
++ if (rcven_upper > 511 - pi_reserve)
++ io_lat[rank] -= DIV_ROUND_UP(rcven_upper - (511 - pi_reserve), 64);
++
++ /* Check for IO latency over/underflow */
++ if (io_lat[rank] - io_g_offset > 14)
++ io_g_offset = io_lat[rank] - 14;
++
++ if (io_lat[rank] - io_g_offset < 1)
++ io_g_offset = io_lat[rank] - 1;
++
++ const int8_t cycle_offset = io_lat[rank] - ctrl->io_latency[channel][rank];
++ for (uint8_t byte = 0; byte < ctrl->lanes; byte++) {
++ ctrl->rcven[channel][rank][byte] += 64 * cycle_offset;
++ update_rxt(ctrl, channel, rank, byte, RXT_RESTORE, 0);
++ }
++ }
++
++ /* Calculate new IO comp latency */
++ union sc_io_latency_reg sc_io_lat = {
++ .raw = mchbar_read32(SC_IO_LATENCY_ch(channel)),
++ };
++
++ /* Check if we are underflowing or overflowing this field */
++ if (io_g_offset < 0 && sc_io_lat.rt_iocomp < -io_g_offset) {
++ printk(BIOS_ERR, "%s: IO COMP underflow\n", __func__);
++ printk(BIOS_ERR, "io_g_offset: %d\n", io_g_offset);
++ printk(BIOS_ERR, "rt_iocomp: %u\n", sc_io_lat.rt_iocomp);
++ return RAMINIT_STATUS_RCVEN_FAILURE;
++ }
++ if (io_g_offset > 0 && io_g_offset > 0x3f - sc_io_lat.rt_iocomp) {
++ printk(BIOS_ERR, "%s: IO COMP overflow\n", __func__);
++ printk(BIOS_ERR, "io_g_offset: %d\n", io_g_offset);
++ printk(BIOS_ERR, "rt_iocomp: %u\n", sc_io_lat.rt_iocomp);
++ return RAMINIT_STATUS_RCVEN_FAILURE;
++ }
++ sc_io_lat.rt_iocomp += io_g_offset;
++ ctrl->rt_io_comp[channel] = sc_io_lat.rt_iocomp;
++ for (uint8_t rank = 0; rank < NUM_SLOTRANKS; rank++) {
++ if (ctrl->rankmap[channel] & BIT(rank))
++ ctrl->io_latency[channel][rank] = io_lat[rank] - io_g_offset;
++
++ const uint8_t shift = rank * 4;
++ sc_io_lat.raw &= ~(0xf << shift);
++ sc_io_lat.raw |= ctrl->io_latency[channel][rank] << shift;
++ }
++ mchbar_write32(SC_IO_LATENCY_ch(channel), sc_io_lat.raw);
++ return RAMINIT_STATUS_SUCCESS;
++}
++
++#define RL_START (256 + 24)
++#define RL_STOP (384 + 24)
++#define RL_STEP 8
++
++#define RE_NUM_SAMPLES 6
++
++static enum raminit_status verify_high_region(const int32_t center, const int32_t lwidth)
++{
++ if (center > RL_STOP) {
++ /* Check if center of high was found where it should be */
++ printk(BIOS_ERR, "RcvEn: Center of high (%d) higher than expected\n", center);
++ return RAMINIT_STATUS_RCVEN_FAILURE;
++ }
++ if (lwidth <= 32) {
++ /* Check if width is large enough */
++ printk(BIOS_ERR, "RcvEn: Width of high region (%d) too small\n", lwidth);
++ return RAMINIT_STATUS_RCVEN_FAILURE;
++ }
++ if (lwidth >= 96) {
++ /* Since we're calibrating a phase, a too large region is a problem */
++ printk(BIOS_ERR, "RcvEn: Width of high region (%d) too large\n", lwidth);
++ return RAMINIT_STATUS_RCVEN_FAILURE;
++ }
++ return RAMINIT_STATUS_SUCCESS;
++}
++
++static void program_io_latency(struct sysinfo *ctrl, const uint8_t channel, const uint8_t rank)
++{
++ const uint8_t shift = rank * 4;
++ const uint8_t iolat = ctrl->io_latency[channel][rank];
++ mchbar_clrsetbits32(SC_IO_LATENCY_ch(channel), 0xf << shift, iolat << shift);
++}
++
++static void program_rl_delays(struct sysinfo *ctrl, const uint8_t rank, const uint16_t rl_delay)
++{
++ for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
++ if (!rank_in_ch(ctrl, rank, channel))
++ continue;
++
++ for (uint8_t byte = 0; byte < ctrl->lanes; byte++)
++ update_rxt(ctrl, channel, rank, byte, RXT_RCVEN, rl_delay);
++ }
++}
++
++static bool sample_dqs(const uint8_t channel, const uint8_t byte)
++{
++ return (get_data_train_feedback(channel, byte) & 0x1ff) >= BIT(RE_NUM_SAMPLES - 1);
++}
++
++enum raminit_status train_receive_enable(struct sysinfo *ctrl)
++{
++ const struct reut_box reut_addr = {
++ .col = {
++ .start = 0,
++ .stop = 1023,
++ .inc_rate = 0,
++ .inc_val = 1,
++ },
++ };
++ const struct wdb_pat wdb_pattern = {
++ .start_ptr = 0,
++ .stop_ptr = 9,
++ .inc_rate = 32,
++ .dq_pattern = BASIC_VA,
++ };
++
++ const uint16_t bytemask = BIT(ctrl->lanes) - 1;
++ const uint8_t fine_step = 1;
++
++ const uint8_t rt_delta = is_hsw_ult() ? 4 : 2;
++ const uint8_t rt_io_comp = 21 + rt_delta;
++ const uint8_t rt_latency = 16 + rt_delta;
++ setup_io_test(
++ ctrl,
++ ctrl->chanmap,
++ PAT_RD,
++ 2,
++ RE_NUM_SAMPLES + 1,
++ &reut_addr,
++ 0,
++ &wdb_pattern,
++ 0,
++ 8);
++
++ for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
++ if (!does_ch_exist(ctrl, channel))
++ continue;
++
++ for (uint8_t byte = 0; byte < ctrl->lanes; byte++) {
++ union ddr_data_control_2_reg data_control_2 = {
++ .raw = ctrl->dq_control_2[channel][byte],
++ };
++ data_control_2.force_rx_on = 1;
++ mchbar_write32(DQ_CONTROL_2(channel, byte), data_control_2.raw);
++ }
++ union ddr_data_control_0_reg data_control_0 = {
++ .raw = ctrl->dq_control_0[channel],
++ };
++ if (ctrl->lpddr) {
++ /**
++ * W/A for b4618574 - @todo: remove for HSW ULT C0
++ * Can't have force_odt_on together with leaker, disable LPDDR
++ * mode during this training step. lpddr_mode is restored
++ * at the end of this function from the host structure.
++ */
++ data_control_0.lpddr_mode = 0;
++ mchbar_write32(DDR_DATA_ch_CONTROL_0(channel), data_control_0.raw);
++ }
++ data_control_0.force_odt_on = 1;
++ data_control_0.rl_training_mode = 1;
++ mchbar_write32(DDR_DATA_ch_CONTROL_0(channel), data_control_0.raw);
++ mchbar_write32(SC_IO_LATENCY_ch(channel), (union sc_io_latency_reg) {
++ .rt_iocomp = rt_io_comp,
++ }.raw);
++ }
++ enum raminit_status status = RAMINIT_STATUS_SUCCESS;
++ for (uint8_t rank = 0; rank < NUM_SLOTRANKS; rank++) {
++ if (!does_rank_exist(ctrl, rank))
++ continue;
++
++ /*
++ * Set initial roundtrip latency values. Assume -4 QCLK for worst board
++ * layout. This is calculated as HW_ROUNDT_LAT_DEFAULT_VALUE plus:
++ *
++ * DDR3: Default + (2 * tAA) + 4 QCLK + PI_CLK + N-mode value * 2
++ * LPDDR3: Default + (2 * tAA) + 4 QCLK + PI_CLK + tDQSCK_max
++ *
++ * N-mode is 3 during training mode. Both channels use the same timings.
++ */
++ /** TODO: differs for LPDDR **/
++ const uint32_t tmp = MAX(ctrl->multiplier, 4) + 5 + 2 * ctrl->tAA;
++ const uint32_t initial_rt_latency = MIN(rt_latency + tmp, 0x3f);
++
++ uint8_t chanmask = 0;
++ for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
++ chanmask |= select_reut_ranks(ctrl, channel, BIT(rank));
++ if (!rank_in_ch(ctrl, rank, channel))
++ continue;
++
++ ctrl->io_latency[channel][rank] = 0;
++ mchbar_write8(SC_ROUNDT_LAT_ch(channel) + rank, initial_rt_latency);
++ ctrl->rt_latency[channel][rank] = initial_rt_latency;
++ }
++
++ printk(BIOS_DEBUG, "Rank %u\n", rank);
++ printk(BIOS_DEBUG, "Steps 1 and 2: Find middle of high region\n");
++ printk(RCVEN_PLOT, "Byte");
++ for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
++ if (!rank_in_ch(ctrl, rank, channel))
++ continue;
++
++ printk(RCVEN_PLOT, "\t");
++ for (uint8_t byte = 0; byte < ctrl->lanes; byte++)
++ printk(RCVEN_PLOT, "%u ", byte);
++ }
++ printk(RCVEN_PLOT, "\nRcvEn\n");
++ struct phase_train_data region_data[NUM_CHANNELS][NUM_LANES] = { 0 };
++ for (uint16_t rl_delay = RL_START; rl_delay < RL_STOP; rl_delay += RL_STEP) {
++ printk(RCVEN_PLOT, " % 3d", rl_delay);
++ program_rl_delays(ctrl, rank, rl_delay);
++ run_io_test(ctrl, chanmask, BASIC_VA, true);
++ for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
++ if (!rank_in_ch(ctrl, rank, channel))
++ continue;
++
++ printk(RCVEN_PLOT, "\t");
++ for (uint8_t byte = 0; byte < ctrl->lanes; byte++) {
++ const bool high = sample_dqs(channel, byte);
++ printk(RCVEN_PLOT, high ? ". " : "# ");
++ phase_record_pass(
++ &region_data[channel][byte],
++ high,
++ rl_delay,
++ RL_START,
++ RL_STEP);
++ }
++ }
++ printk(RCVEN_PLOT, "\n");
++ }
++ printk(RCVEN_PLOT, "\n");
++ printk(BIOS_DEBUG, "Update RcvEn timing to be in the center of high region\n");
++ for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
++ if (!rank_in_ch(ctrl, rank, channel))
++ continue;
++
++ printk(BIOS_DEBUG, "C%u.R%u: \tLeft\tRight\tWidth\tCenter\n",
++ channel, rank);
++ for (uint8_t byte = 0; byte < ctrl->lanes; byte++) {
++ struct phase_train_data *const curr_data =
++ &region_data[channel][byte];
++ phase_append_current_to_initial(curr_data, RL_START, RL_STEP);
++ const int32_t lwidth = range_width(curr_data->largest);
++ const int32_t center = range_center(curr_data->largest);
++ printk(BIOS_DEBUG, " B%u: \t%d\t%d\t%d\t%d\n",
++ byte,
++ curr_data->largest.start,
++ curr_data->largest.end,
++ lwidth,
++ center);
++
++ status = verify_high_region(center, lwidth);
++ if (status) {
++ printk(BIOS_ERR,
++ "RcvEn problems on channel %u, byte %u\n",
++ channel, byte);
++ goto clean_up;
++ }
++ ctrl->rcven[channel][rank][byte] = center;
++ update_rxt(ctrl, channel, rank, byte, RXT_RESTORE, 0);
++ }
++ printk(BIOS_DEBUG, "\n");
++ }
++
++ printk(BIOS_DEBUG, "Step 3: Quarter preamble - Walk backwards\n");
++ printk(RCVEN_PLOT, "Byte");
++ for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
++ if (!rank_in_ch(ctrl, rank, channel))
++ continue;
++
++ printk(RCVEN_PLOT, "\t");
++ for (uint8_t byte = 0; byte < ctrl->lanes; byte++)
++ printk(RCVEN_PLOT, "%u ", byte);
++ }
++ printk(RCVEN_PLOT, "\nIOLAT\n");
++ bool done = false;
++ while (!done) {
++ run_io_test(ctrl, chanmask, BASIC_VA, true);
++ done = true;
++ for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
++ if (!rank_in_ch(ctrl, rank, channel))
++ continue;
++
++ printk(RCVEN_PLOT, " %2u\t", ctrl->io_latency[channel][rank]);
++ uint16_t highs = 0;
++ for (uint8_t byte = 0; byte < ctrl->lanes; byte++) {
++ const bool high = sample_dqs(channel, byte);
++ printk(RCVEN_PLOT, high ? "H " : "L ");
++ if (high)
++ highs |= BIT(byte);
++ }
++ if (!highs)
++ continue;
++
++ done = false;
++
++ /* If all bytes sample high, adjust timing globally */
++ if (highs == bytemask && ctrl->io_latency[channel][rank] < 14) {
++ ctrl->io_latency[channel][rank] += 2;
++ ctrl->io_latency[channel][rank] %= 16;
++ program_io_latency(ctrl, channel, rank);
++ continue;
++ }
++
++ /* Otherwise, adjust individual bytes */
++ for (uint8_t byte = 0; byte < ctrl->lanes; byte++) {
++ if (!(highs & BIT(byte)))
++ continue;
++
++ if (ctrl->rcven[channel][rank][byte] < 128) {
++ printk(BIOS_ERR,
++ "RcvEn underflow: walking backwards\n");
++ printk(BIOS_ERR,
++ "For channel %u, rank %u, byte %u\n",
++ channel, rank, byte);
++ status = RAMINIT_STATUS_RCVEN_FAILURE;
++ goto clean_up;
++ }
++ ctrl->rcven[channel][rank][byte] -= 128;
++ update_rxt(ctrl, channel, rank, byte, RXT_RESTORE, 0);
++ }
++ }
++ printk(RCVEN_PLOT, "\n");
++ }
++ for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
++ if (!rank_in_ch(ctrl, rank, channel))
++ continue;
++
++ printk(BIOS_DEBUG, "\nC%u: Preamble\n", channel);
++ for (uint8_t byte = 0; byte < ctrl->lanes; byte++) {
++ printk(BIOS_DEBUG,
++ " B%u: %u\n", byte, ctrl->rcven[channel][rank][byte]);
++ }
++ }
++ printk(BIOS_DEBUG, "\n");
++
++ printk(BIOS_DEBUG, "Step 4: Add 1 qclk\n");
++ for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
++ if (!rank_in_ch(ctrl, rank, channel))
++ continue;
++
++ for (uint8_t byte = 0; byte < ctrl->lanes; byte++) {
++ ctrl->rcven[channel][rank][byte] += 64;
++ update_rxt(ctrl, channel, rank, byte, RXT_RESTORE, 0);
++ }
++ }
++ printk(BIOS_DEBUG, "\n");
++
++ printk(BIOS_DEBUG, "Step 5: Walk forward to find rising edge\n");
++ printk(RCVEN_PLOT, "Byte");
++ for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
++ if (!rank_in_ch(ctrl, rank, channel))
++ continue;
++
++ printk(RCVEN_PLOT, "\t");
++ for (uint8_t byte = 0; byte < ctrl->lanes; byte++)
++ printk(RCVEN_PLOT, "%u ", byte);
++ }
++ printk(RCVEN_PLOT, "\n inc\n");
++ uint16_t ch_result[NUM_CHANNELS] = { 0 };
++ uint8_t inc_preamble[NUM_CHANNELS][NUM_LANES] = { 0 };
++ for (uint8_t inc = 0; inc < 64; inc += fine_step) {
++ printk(RCVEN_PLOT, " %2u\t", inc);
++ run_io_test(ctrl, chanmask, BASIC_VA, true);
++ done = true;
++ for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
++ if (!rank_in_ch(ctrl, rank, channel))
++ continue;
++
++ for (uint8_t byte = 0; byte < ctrl->lanes; byte++) {
++ if (ch_result[channel] & BIT(byte)) {
++ /* Skip bytes that are already done */
++ printk(RCVEN_PLOT, ". ");
++ continue;
++ }
++ const bool pass = sample_dqs(channel, byte);
++ printk(RCVEN_PLOT, pass ? ". " : "# ");
++ if (pass) {
++ ch_result[channel] |= BIT(byte);
++ continue;
++ }
++ ctrl->rcven[channel][rank][byte] += fine_step;
++ update_rxt(ctrl, channel, rank, byte, RXT_RESTORE, 0);
++ inc_preamble[channel][byte] = inc;
++ }
++ printk(RCVEN_PLOT, "\t");
++ if (ch_result[channel] != bytemask)
++ done = false;
++ }
++ printk(RCVEN_PLOT, "\n");
++ if (done)
++ break;
++ }
++ printk(BIOS_DEBUG, "\n");
++ if (!done) {
++ printk(BIOS_ERR, "Error: Preamble edge not found for all bytes\n");
++ printk(BIOS_ERR, "The final RcvEn results are as follows:\n");
++ for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
++ if (!rank_in_ch(ctrl, rank, channel))
++ continue;
++
++ printk(BIOS_ERR, "Channel %u Rank %u: preamble\n",
++ channel, rank);
++ for (uint8_t byte = 0; byte < ctrl->lanes; byte++) {
++ printk(BIOS_ERR, " Byte %u: %u%s\n", byte,
++ ctrl->rcven[channel][rank][byte],
++ (ch_result[channel] ^ bytemask) & BIT(byte)
++ ? ""
++ : " *** Check this byte! ***");
++ }
++ }
++ status = RAMINIT_STATUS_RCVEN_FAILURE;
++ goto clean_up;
++ }
++
++ printk(BIOS_DEBUG, "Step 6: center on preamble and clean up rank\n");
++ for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
++ if (!rank_in_ch(ctrl, rank, channel))
++ continue;
++
++ printk(BIOS_DEBUG, "C%u: Preamble increment\n", channel);
++ for (uint8_t byte = 0; byte < ctrl->lanes; byte++) {
++ /*
++ * For Traditional, pull in RcvEn by 64. For ULT, take the DQS
++ * drift into account to the specified guardband: tDQSCK_DRIFT.
++ */
++ ctrl->rcven[channel][rank][byte] -= tDQSCK_DRIFT;
++ update_rxt(ctrl, channel, rank, byte, RXT_RESTORE, 0);
++ printk(BIOS_DEBUG, " B%u: %u %u\n", byte,
++ ctrl->rcven[channel][rank][byte],
++ inc_preamble[channel][byte]);
++ }
++ printk(BIOS_DEBUG, "\n");
++ }
++ printk(BIOS_DEBUG, "\n");
++ }
++
++clean_up:
++ for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
++ if (!does_ch_exist(ctrl, channel))
++ continue;
++
++ if (ctrl->lpddr) {
++ /**
++ * W/A for b4618574 - @todo: remove for HSW ULT C0
++ * Can't have force_odt_on together with leaker, disable LPDDR mode for
++ * this training step. This write will disable force_odt_on while still
++ * keeping LPDDR mode disabled. Second write will restore LPDDR mode.
++ */
++ union ddr_data_control_0_reg data_control_0 = {
++ .raw = ctrl->dq_control_0[channel],
++ };
++ data_control_0.lpddr_mode = 0;
++ mchbar_write32(DDR_DATA_ch_CONTROL_0(channel), data_control_0.raw);
++ }
++ mchbar_write32(DDR_DATA_ch_CONTROL_0(channel), ctrl->dq_control_0[channel]);
++ for (uint8_t byte = 0; byte < ctrl->lanes; byte++) {
++ mchbar_write32(DQ_CONTROL_2(channel, byte),
++ ctrl->dq_control_2[channel][byte]);
++ }
++ }
++ io_reset();
++ if (status)
++ return status;
++
++ printk(BIOS_DEBUG, "Step 7: Sync IO latency across all ranks\n");
++ for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
++ if (!does_ch_exist(ctrl, channel))
++ continue;
++
++ status = change_rcven_timing(ctrl, channel);
++ if (status)
++ return status;
++ }
++ printk(BIOS_DEBUG, "\nFinal Receive Enable and IO latency settings:\n");
++ for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
++ if (!does_ch_exist(ctrl, channel))
++ continue;
++
++ for (uint8_t rank = 0; rank < NUM_SLOTRANKS; rank++) {
++ if (!rank_in_ch(ctrl, rank, channel))
++ continue;
++
++ const union sc_io_latency_reg sc_io_latency = {
++ .raw = mchbar_read32(SC_IO_LATENCY_ch(channel)),
++ };
++ printk(BIOS_DEBUG, " C%u.R%u: IOLAT = %u rt_iocomp = %u\n", channel,
++ rank, ctrl->io_latency[channel][rank], sc_io_latency.rt_iocomp);
++ for (uint8_t byte = 0; byte < ctrl->lanes; byte++) {
++ printk(BIOS_DEBUG, " B%u: %u\n", byte,
++ ctrl->rcven[channel][rank][byte]);
++ }
++ printk(BIOS_DEBUG, "\n");
++ }
++ }
++ return status;
++}
+diff --git a/src/northbridge/intel/haswell/registers/mchbar.h b/src/northbridge/intel/haswell/registers/mchbar.h
+index a81559bb1e..9172d4f2b0 100644
+--- a/src/northbridge/intel/haswell/registers/mchbar.h
++++ b/src/northbridge/intel/haswell/registers/mchbar.h
+@@ -18,6 +18,8 @@
+ #define RX_TRAIN_ch_r_b(ch, rank, byte) _DDRIO_C_R_B(0x0000, ch, rank, byte)
+ #define TX_TRAIN_ch_r_b(ch, rank, byte) _DDRIO_C_R_B(0x0020, ch, rank, byte)
+
++#define DDR_DATA_TRAIN_FEEDBACK(ch, byte) _DDRIO_C_R_B(0x0054, ch, 0, byte)
++
+ #define DQ_CONTROL_2(ch, byte) _DDRIO_C_R_B(0x0064, ch, 0, byte)
+ #define DQ_CONTROL_0(ch, byte) _DDRIO_C_R_B(0x0074, ch, 0, byte)
+
+@@ -100,6 +102,7 @@
+ #define COMMAND_RATE_LIMIT_ch(ch) _MCMAIN_C(0x4010, ch)
+ #define TC_BANK_RANK_D_ch(ch) _MCMAIN_C(0x4014, ch)
+ #define SC_ROUNDT_LAT_ch(ch) _MCMAIN_C(0x4024, ch)
++#define SC_IO_LATENCY_ch(ch) _MCMAIN_C(0x4028, ch)
+
+ #define REUT_ch_PAT_WDB_CL_MUX_CFG(ch) _MCMAIN_C(0x4040, ch)
+
+--
+2.39.2
+