From be58501141aa97aa544b670e566cd6cf6797c18e Mon Sep 17 00:00:00 2001
From: Angel Pons <th3fanbus@gmail.com>
Date: Wed, 17 Apr 2024 13:20:32 +0200
Subject: [PATCH 17/20] haswell NRI: Do sense amplifier offset training

Quoting Wikipedia:

  A sense amplifier is a circuit that is used to amplify and detect
  small signals in electronic systems. It is commonly used in memory
  circuits, such as dynamic random access memory (DRAM), to read and
  amplify the weak signals stored in memory cells.

In this case, we're calibrating the sense amplifiers in the memory
controller. This training procedure uses a magic "sense amp offset
cancel" mode of the DDRIO to observe the sampled logic levels, and
sweeps Vref to find the low-high transition for each bit lane. The
procedure consists of two stages: the first stage centers per-byte
Vref (to ensure per-bit Vref offsets are as small as possible) and
the second stage centers per-bit Vref.

Because this procedure uses the "sense amp offset cancel" mode, it
does not rely on DRAM being trained. It is assumed that the memory
controller simply makes sense amp output levels observable via the
`DDR_DATA_TRAIN_FEEDBACK` register and that the memory bus is idle
during this training step (so the lane voltage is Vdd / 2).

Note: This procedure will need to be adapted for Broadwell because
it has per-rank per-bit RxVref registers, whereas Haswell only has
a single per-bit RxVref register for all ranks.

Change-Id: Ia07db68763f90e9701c8a376e01279ada8dbbe07
Signed-off-by: Angel Pons <th3fanbus@gmail.com>
---
 .../intel/haswell/native_raminit/Makefile.mk  |   1 +
 .../haswell/native_raminit/raminit_main.c     |   1 +
 .../haswell/native_raminit/raminit_native.h   |  12 +
 .../native_raminit/train_sense_amp_offset.c   | 341 ++++++++++++++++++
 .../intel/haswell/registers/mchbar.h          |   2 +
 5 files changed, 357 insertions(+)
 create mode 100644 src/northbridge/intel/haswell/native_raminit/train_sense_amp_offset.c

diff --git a/src/northbridge/intel/haswell/native_raminit/Makefile.mk b/src/northbridge/intel/haswell/native_raminit/Makefile.mk
index 8fdd17c542..4bd668a2d6 100644
--- a/src/northbridge/intel/haswell/native_raminit/Makefile.mk
+++ b/src/northbridge/intel/haswell/native_raminit/Makefile.mk
@@ -21,3 +21,4 @@ romstage-y += timings_refresh.c
 romstage-y += train_jedec_write_leveling.c
 romstage-y += train_read_mpr.c
 romstage-y += train_receive_enable.c
+romstage-y += train_sense_amp_offset.c
diff --git a/src/northbridge/intel/haswell/native_raminit/raminit_main.c b/src/northbridge/intel/haswell/native_raminit/raminit_main.c
index 056dde1adc..ce637e2d03 100644
--- a/src/northbridge/intel/haswell/native_raminit/raminit_main.c
+++ b/src/northbridge/intel/haswell/native_raminit/raminit_main.c
@@ -60,6 +60,7 @@ static const struct task_entry cold_boot[] = {
 	{ configure_memory_map,                                   true, "MEMMAP",     },
 	{ do_jedec_init,                                          true, "JEDECINIT",  },
 	{ pre_training,                                           true, "PRETRAIN",   },
+	{ train_sense_amp_offset,                                 true, "SOT",        },
 	{ train_receive_enable,                                   true, "RCVET",      },
 	{ train_read_mpr,                                         true, "RDMPRT",     },
 	{ train_jedec_write_leveling,                             true, "JWRL",       },
diff --git a/src/northbridge/intel/haswell/native_raminit/raminit_native.h b/src/northbridge/intel/haswell/native_raminit/raminit_native.h
index 0750904aec..95ccd0a8b3 100644
--- a/src/northbridge/intel/haswell/native_raminit/raminit_native.h
+++ b/src/northbridge/intel/haswell/native_raminit/raminit_native.h
@@ -22,6 +22,8 @@
 #define NUM_LANES		9
 #define NUM_LANES_NO_ECC	8
 
+#define NUM_BITS		8
+
 #define COMP_INT		10
 
 /* Always use 12 legs for emphasis (not trained) */
@@ -218,6 +220,7 @@ enum raminit_status {
 	RAMINIT_STATUS_MPLL_INIT_FAILURE,
 	RAMINIT_STATUS_POLL_TIMEOUT,
 	RAMINIT_STATUS_REUT_ERROR,
+	RAMINIT_STATUS_SAMP_OFFSET_FAILURE,
 	RAMINIT_STATUS_RCVEN_FAILURE,
 	RAMINIT_STATUS_RMPR_FAILURE,
 	RAMINIT_STATUS_JWRL_FAILURE,
@@ -243,6 +246,12 @@ struct raminit_dimm_info {
 	bool valid;
 };
 
+struct vref_margin {
+	uint8_t low;
+	uint8_t center;
+	uint8_t high;
+};
+
 struct sysinfo {
 	enum raminit_boot_mode bootmode;
 	enum generic_stepping stepping;
@@ -330,6 +339,8 @@ struct sysinfo {
 	uint8_t rxdqsn[NUM_CHANNELS][NUM_SLOTRANKS][NUM_LANES];
 	int8_t  rxvref[NUM_CHANNELS][NUM_SLOTRANKS][NUM_LANES];
 
+	struct vref_margin rxdqvrefpb[NUM_CHANNELS][NUM_SLOTRANKS][NUM_LANES][NUM_BITS];
+
 	uint8_t clk_pi_code[NUM_CHANNELS][NUM_SLOTRANKS];
 	uint8_t ctl_pi_code[NUM_CHANNELS][NUM_SLOTRANKS];
 	uint8_t cke_pi_code[NUM_CHANNELS][NUM_SLOTRANKS];
@@ -452,6 +463,7 @@ enum raminit_status convert_timings(struct sysinfo *ctrl);
 enum raminit_status configure_mc(struct sysinfo *ctrl);
 enum raminit_status configure_memory_map(struct sysinfo *ctrl);
 enum raminit_status do_jedec_init(struct sysinfo *ctrl);
+enum raminit_status train_sense_amp_offset(struct sysinfo *ctrl);
 enum raminit_status train_receive_enable(struct sysinfo *ctrl);
 enum raminit_status train_read_mpr(struct sysinfo *ctrl);
 enum raminit_status train_jedec_write_leveling(struct sysinfo *ctrl);
diff --git a/src/northbridge/intel/haswell/native_raminit/train_sense_amp_offset.c b/src/northbridge/intel/haswell/native_raminit/train_sense_amp_offset.c
new file mode 100644
index 0000000000..d4f199fefb
--- /dev/null
+++ b/src/northbridge/intel/haswell/native_raminit/train_sense_amp_offset.c
@@ -0,0 +1,341 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#include <assert.h>
+#include <commonlib/bsd/clamp.h>
+#include <console/console.h>
+#include <delay.h>
+#include <lib.h>
+#include <types.h>
+
+#include "raminit_native.h"
+
+#define VREF_OFFSET_PLOT	RAM_DEBUG
+#define SAMP_OFFSET_PLOT	RAM_DEBUG
+
+struct vref_train_data {
+	int8_t best_sum;
+	int8_t best_vref;
+	int8_t sum_bits;
+	uint8_t high_mask;
+	uint8_t low_mask;
+};
+
+static enum raminit_status train_vref_offset(struct sysinfo *ctrl)
+{
+	const int8_t vref_start = -15;
+	const int8_t vref_stop  = 15;
+	const struct vref_train_data initial_vref_values = {
+		.best_sum  = -NUM_LANES,
+		.best_vref = 0,
+		.high_mask = 0,
+		.low_mask  = 0xff,
+	};
+	struct vref_train_data vref_data[NUM_CHANNELS][NUM_LANES];
+
+	printk(VREF_OFFSET_PLOT, "Plot of sum_bits across Vref settings\nChannel");
+	for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
+		if (!does_ch_exist(ctrl, channel))
+			continue;
+
+		printk(VREF_OFFSET_PLOT, "\t%u\t\t", channel);
+	}
+
+	printk(VREF_OFFSET_PLOT, "\nByte");
+	for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
+		if (!does_ch_exist(ctrl, channel))
+			continue;
+
+		printk(VREF_OFFSET_PLOT, "\t");
+		for (uint8_t byte = 0; byte < ctrl->lanes; byte++) {
+			printk(VREF_OFFSET_PLOT, "%u ", byte);
+			vref_data[channel][byte] = initial_vref_values;
+			union ddr_data_control_2_reg data_control_2 = {
+				.raw = ctrl->dq_control_2[channel][byte],
+			};
+			data_control_2.force_bias_on = 1;
+			data_control_2.force_rx_on   = 1;
+			mchbar_write32(DQ_CONTROL_2(channel, byte), data_control_2.raw);
+		}
+	}
+
+	/* Sweep through Vref settings and find point SampOffset of +/- 7 passes */
+	printk(VREF_OFFSET_PLOT, "\n1/2 Vref");
+	for (int8_t vref = vref_start; vref <= vref_stop; vref++) {
+		printk(VREF_OFFSET_PLOT, "\n% 3d", vref);
+
+		/*
+		 * To perform this test, enable offset cancel mode and enable ODT.
+		 * Check results and update variables. Ideal result is all zeroes.
+		 * Clear offset cancel mode at end of test to write RX_OFFSET_VDQ.
+		 */
+		change_1d_margin_multicast(ctrl, RdV, vref, 0, false, REG_FILE_USE_RANK);
+
+		/* Program settings for Vref and SampOffset = 7 (8 + 7) */
+		mchbar_write32(DDR_DATA_RX_OFFSET_VDQ, 0xffffffff);
+		for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
+			if (!does_ch_exist(ctrl, channel))
+				continue;
+
+			/* Propagate delay values (without a read command) */
+			union ddr_data_control_0_reg data_control_0 = {
+				.raw = ctrl->dq_control_0[channel],
+			};
+			data_control_0.read_rf_rd      = 1;
+			data_control_0.read_rf_wr      = 0;
+			data_control_0.read_rf_rank    = 0;
+			data_control_0.force_odt_on    = 1;
+			data_control_0.samp_train_mode = 1;
+			mchbar_write32(DDR_DATA_ch_CONTROL_0(channel), data_control_0.raw);
+			udelay(1);
+			data_control_0.samp_train_mode = 0;
+			mchbar_write32(DDR_DATA_ch_CONTROL_0(channel), data_control_0.raw);
+			for (uint8_t byte = 0; byte < ctrl->lanes; byte++) {
+				const uint8_t feedback = get_data_train_feedback(channel, byte);
+				struct vref_train_data *curr_data = &vref_data[channel][byte];
+				curr_data->low_mask &= feedback;
+				curr_data->sum_bits = -popcnt(feedback);
+			}
+		}
+
+		/* Program settings for Vref and SampOffset = -7 (8 - 7) */
+		mchbar_write32(DDR_DATA_RX_OFFSET_VDQ, 0x11111111);
+		for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
+			if (!does_ch_exist(ctrl, channel))
+				continue;
+
+			/* Propagate delay values (without a read command) */
+			union ddr_data_control_0_reg data_control_0 = {
+				.raw = ctrl->dq_control_0[channel],
+			};
+			data_control_0.read_rf_rd      = 1;
+			data_control_0.read_rf_wr      = 0;
+			data_control_0.read_rf_rank    = 0;
+			data_control_0.force_odt_on    = 1;
+			data_control_0.samp_train_mode = 1;
+			mchbar_write32(DDR_DATA_ch_CONTROL_0(channel), data_control_0.raw);
+			udelay(1);
+			data_control_0.samp_train_mode = 0;
+			mchbar_write32(DDR_DATA_ch_CONTROL_0(channel), data_control_0.raw);
+			printk(VREF_OFFSET_PLOT, "\t");
+			for (uint8_t byte = 0; byte < ctrl->lanes; byte++) {
+				const uint8_t feedback = get_data_train_feedback(channel, byte);
+				struct vref_train_data *curr_data = &vref_data[channel][byte];
+				curr_data->high_mask |= feedback;
+				curr_data->sum_bits += popcnt(feedback);
+				printk(VREF_OFFSET_PLOT, "%d ", curr_data->sum_bits);
+				if (curr_data->sum_bits > curr_data->best_sum) {
+					curr_data->best_sum  = curr_data->sum_bits;
+					curr_data->best_vref = vref;
+					ctrl->rxvref[channel][0][byte] = vref;
+				} else if (curr_data->sum_bits == curr_data->best_sum) {
+					curr_data->best_vref = vref;
+				}
+			}
+		}
+	}
+	printk(BIOS_DEBUG, "\n\nHi-Lo (XOR):");
+	enum raminit_status status = RAMINIT_STATUS_SUCCESS;
+	for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
+		if (!does_ch_exist(ctrl, channel))
+			continue;
+
+		printk(BIOS_DEBUG, "\n  C%u:", channel);
+		for (uint8_t byte = 0; byte < ctrl->lanes; byte++) {
+			struct vref_train_data *const curr_data = &vref_data[channel][byte];
+			const uint8_t bit_xor = curr_data->high_mask ^ curr_data->low_mask;
+			printk(BIOS_DEBUG, "\t0x%02x", bit_xor);
+			if (bit_xor == 0xff)
+				continue;
+
+			/* Report an error if any bit did not change */
+			status = RAMINIT_STATUS_SAMP_OFFSET_FAILURE;
+		}
+	}
+	if (status)
+		printk(BIOS_ERR, "\nUnexpected bit error in Vref offset training\n");
+
+	printk(BIOS_DEBUG, "\n\nRdVref:");
+	change_1d_margin_multicast(ctrl, RdV, 0, 0, false, REG_FILE_USE_RANK);
+	for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
+		if (!does_ch_exist(ctrl, channel))
+			continue;
+
+		printk(BIOS_DEBUG, "\n  C%u:", channel);
+		for (uint8_t byte = 0; byte < ctrl->lanes; byte++) {
+			struct vref_train_data *const curr_data = &vref_data[channel][byte];
+			const int8_t vref_width =
+				curr_data->best_vref - ctrl->rxvref[channel][0][byte];
+
+			/*
+			 * Step size for Rx Vref in DATA_OFFSET_TRAIN is about 3.9 mV
+			 * whereas Rx Vref step size in RX_TRAIN_RANK is about 7.8 mV
+			 */
+			int8_t vref = ctrl->rxvref[channel][0][byte] + vref_width / 2;
+			if (vref < 0)
+				vref--;
+			else
+				vref++;
+
+			for (uint8_t rank = 0; rank < NUM_SLOTRANKS; rank++) {
+				if (!rank_in_ch(ctrl, rank, channel))
+					continue;
+
+				ctrl->rxvref[channel][rank][byte] = vref / 2;
+				update_rxt(ctrl, channel, rank, byte, RXT_RESTORE, 0);
+			}
+			printk(BIOS_DEBUG, "\t% 4d", ctrl->rxvref[channel][0][byte]);
+		}
+	}
+	printk(BIOS_DEBUG, "\n\n");
+	return status;
+}
+
+/**
+ * LPDDR has an additional bit for DQS per each byte.
+ *
+ * TODO: The DQS value must be written into Data Control 2.
+ */
+#define NUM_OFFSET_TRAIN_BITS	(NUM_BITS + 1)
+
+#define PLOT_CH_SPACE		"  "
+
+struct samp_train_data {
+	uint8_t first_zero;
+	uint8_t last_one;
+};
+
+static void train_samp_offset(struct sysinfo *ctrl)
+{
+	const uint8_t max_train_bits = ctrl->lpddr ? NUM_OFFSET_TRAIN_BITS : NUM_BITS;
+
+	struct samp_train_data samp_data[NUM_CHANNELS][NUM_LANES][NUM_OFFSET_TRAIN_BITS] = {0};
+
+	printk(BIOS_DEBUG, "Channel ");
+	for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
+		if (!does_ch_exist(ctrl, channel))
+			continue;
+
+		printk(BIOS_DEBUG, "%u ", channel); /* Same length as PLOT_CH_SPACE */
+		for (uint8_t byte = 0; byte < ctrl->lanes; byte++)
+			printk(BIOS_DEBUG, "        %s ", ctrl->lpddr ? " " : "");
+	}
+	printk(BIOS_DEBUG, "\nByte    ");
+	for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
+		if (!does_ch_exist(ctrl, channel))
+			continue;
+
+		for (uint8_t byte = 0; byte < ctrl->lanes; byte++)
+			printk(BIOS_DEBUG, "%u       %s ", byte, ctrl->lpddr ? " " : "");
+
+		printk(BIOS_DEBUG, PLOT_CH_SPACE);
+	}
+	printk(SAMP_OFFSET_PLOT, "\nBits    ");
+	for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
+		for (uint8_t byte = 0; byte < ctrl->lanes; byte++)
+			printk(SAMP_OFFSET_PLOT, "01234567%s ", ctrl->lpddr ? "S" : "");
+
+		printk(SAMP_OFFSET_PLOT, PLOT_CH_SPACE);
+	}
+	printk(SAMP_OFFSET_PLOT, "\n SAmp\n");
+	for (uint8_t samp_offset = 1; samp_offset <= 15; samp_offset++) {
+		printk(SAMP_OFFSET_PLOT, "% 5d\t", samp_offset);
+
+		uint32_t rx_offset_vdq = 0;
+		for (uint8_t bit = 0; bit < NUM_BITS; bit++) {
+			rx_offset_vdq += samp_offset << (4 * bit);
+		}
+		mchbar_write32(DDR_DATA_RX_OFFSET_VDQ, rx_offset_vdq);
+		for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
+			if (!does_ch_exist(ctrl, channel))
+				continue;
+
+			/* Propagate delay values (without a read command) */
+			union ddr_data_control_0_reg data_control_0 = {
+				.raw = ctrl->dq_control_0[channel],
+			};
+			data_control_0.read_rf_rd      = 1;
+			data_control_0.read_rf_wr      = 0;
+			data_control_0.read_rf_rank    = 0;
+			data_control_0.force_odt_on    = 1;
+			data_control_0.samp_train_mode = 1;
+			mchbar_write32(DDR_DATA_ch_CONTROL_0(channel), data_control_0.raw);
+			udelay(1);
+			for (uint8_t byte = 0; byte < ctrl->lanes; byte++) {
+				const uint32_t feedback =
+					get_data_train_feedback(channel, byte);
+
+				for (uint8_t bit = 0; bit < max_train_bits; bit++) {
+					struct samp_train_data *const curr_data =
+							&samp_data[channel][byte][bit];
+					const bool result = feedback & BIT(bit);
+					if (result) {
+						curr_data->last_one = samp_offset;
+					} else if (curr_data->first_zero == 0) {
+						curr_data->first_zero = samp_offset;
+					}
+					printk(SAMP_OFFSET_PLOT, result ? "." : "#");
+				}
+				printk(SAMP_OFFSET_PLOT, " ");
+			}
+			printk(SAMP_OFFSET_PLOT, PLOT_CH_SPACE);
+			data_control_0.samp_train_mode = 0;
+			mchbar_write32(DDR_DATA_ch_CONTROL_0(channel), data_control_0.raw);
+		}
+		printk(SAMP_OFFSET_PLOT, "\n");
+	}
+	printk(BIOS_DEBUG, "\nBitSAmp ");
+	for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
+		if (!does_ch_exist(ctrl, channel))
+			continue;
+
+		for (uint8_t byte = 0; byte < ctrl->lanes; byte++) {
+			uint32_t rx_offset_vdq = 0;
+			for (uint8_t bit = 0; bit < max_train_bits; bit++) {
+				struct samp_train_data *const curr_data =
+						&samp_data[channel][byte][bit];
+
+				uint8_t vref = curr_data->first_zero + curr_data->last_one;
+				vref = clamp_u8(0, vref / 2, 15);
+				/*
+				 * Check for saturation conditions to make sure
+				 * we are as close as possible to Vdd/2 (750 mV).
+				 */
+				if (curr_data->first_zero == 0)
+					vref = 15;
+				if (curr_data->last_one == 0)
+					vref = 0;
+
+				ctrl->rxdqvrefpb[channel][0][byte][bit].center = vref;
+				rx_offset_vdq += vref & 0xf << (4 * bit);
+				printk(BIOS_DEBUG, "%x", vref);
+			}
+			mchbar_write32(RX_OFFSET_VDQ(channel, byte), rx_offset_vdq);
+			printk(BIOS_DEBUG, " ");
+			download_regfile(ctrl, channel, 1, 0, REG_FILE_USE_RANK, 0, 1, 0);
+		}
+		printk(BIOS_DEBUG, PLOT_CH_SPACE);
+	}
+	printk(BIOS_DEBUG, "\n");
+}
+
+enum raminit_status train_sense_amp_offset(struct sysinfo *ctrl)
+{
+	printk(BIOS_DEBUG, "Stage 1: Vref offset training\n");
+	const enum raminit_status status = train_vref_offset(ctrl);
+
+	printk(BIOS_DEBUG, "Stage 2: Samp offset training\n");
+	train_samp_offset(ctrl);
+
+	/* Clean up after test */
+	for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
+		if (!does_ch_exist(ctrl, channel))
+			continue;
+
+		mchbar_write32(DDR_DATA_ch_CONTROL_0(channel), ctrl->dq_control_0[channel]);
+		for (uint8_t byte = 0; byte < ctrl->lanes; byte++)
+			mchbar_write32(DQ_CONTROL_2(channel, byte),
+				ctrl->dq_control_2[channel][byte]);
+	}
+	io_reset();
+	return status;
+}
diff --git a/src/northbridge/intel/haswell/registers/mchbar.h b/src/northbridge/intel/haswell/registers/mchbar.h
index 49a215aa71..1a168a3fc8 100644
--- a/src/northbridge/intel/haswell/registers/mchbar.h
+++ b/src/northbridge/intel/haswell/registers/mchbar.h
@@ -18,6 +18,8 @@
 #define RX_TRAIN_ch_r_b(ch, rank, byte)		_DDRIO_C_R_B(0x0000, ch, rank, byte)
 #define TX_TRAIN_ch_r_b(ch, rank, byte)		_DDRIO_C_R_B(0x0020, ch, rank, byte)
 
+#define RX_OFFSET_VDQ(ch, byte)			_DDRIO_C_R_B(0x004c, ch, 0, byte)
+
 #define DDR_DATA_TRAIN_FEEDBACK(ch, byte)	_DDRIO_C_R_B(0x0054, ch, 0, byte)
 
 #define DQ_CONTROL_1(ch, byte)			_DDRIO_C_R_B(0x0060, ch, 0, byte)
-- 
2.39.2