summaryrefslogtreecommitdiff
path: root/config/coreboot/haswell/patches/0011-haswell-NRI-Add-RcvEn-training.patch
blob: 4815be9aaed40692805878d4207873badd3a5897 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
From 4254a9ff03658d7a6f1a4e32cfe4c65dbfc072f8 Mon Sep 17 00:00:00 2001
From: Angel Pons <th3fanbus@gmail.com>
Date: Sun, 8 May 2022 00:05:41 +0200
Subject: [PATCH 11/20] haswell NRI: Add RcvEn training

Implement the RcvEn (Receive Enable) calibration procedure.

Change-Id: Ifbfa520f3e0486c56d0988ce67af2ddb9cf29888
Signed-off-by: Angel Pons <th3fanbus@gmail.com>
---
 .../intel/haswell/native_raminit/Makefile.mk  |   1 +
 .../haswell/native_raminit/raminit_main.c     |   1 +
 .../haswell/native_raminit/raminit_native.h   |  14 +
 .../haswell/native_raminit/reg_structs.h      |  13 +
 .../native_raminit/train_receive_enable.c     | 561 ++++++++++++++++++
 .../intel/haswell/registers/mchbar.h          |   3 +
 6 files changed, 593 insertions(+)
 create mode 100644 src/northbridge/intel/haswell/native_raminit/train_receive_enable.c

diff --git a/src/northbridge/intel/haswell/native_raminit/Makefile.mk b/src/northbridge/intel/haswell/native_raminit/Makefile.mk
index ebe9e9b762..e2fbfb4211 100644
--- a/src/northbridge/intel/haswell/native_raminit/Makefile.mk
+++ b/src/northbridge/intel/haswell/native_raminit/Makefile.mk
@@ -16,3 +16,4 @@ romstage-y += setup_wdb.c
 romstage-y += spd_bitmunching.c
 romstage-y += testing_io.c
 romstage-y += timings_refresh.c
+romstage-y += train_receive_enable.c
diff --git a/src/northbridge/intel/haswell/native_raminit/raminit_main.c b/src/northbridge/intel/haswell/native_raminit/raminit_main.c
index 5e4674957d..7d444659c3 100644
--- a/src/northbridge/intel/haswell/native_raminit/raminit_main.c
+++ b/src/northbridge/intel/haswell/native_raminit/raminit_main.c
@@ -60,6 +60,7 @@ static const struct task_entry cold_boot[] = {
 	{ configure_memory_map,                                   true, "MEMMAP",     },
 	{ do_jedec_init,                                          true, "JEDECINIT",  },
 	{ pre_training,                                           true, "PRETRAIN",   },
+	{ train_receive_enable,                                   true, "RCVET",      },
 };
 
 /* Return a generic stepping value to make stepping checks simpler */
diff --git a/src/northbridge/intel/haswell/native_raminit/raminit_native.h b/src/northbridge/intel/haswell/native_raminit/raminit_native.h
index 906b3143b9..b4e8c7de5a 100644
--- a/src/northbridge/intel/haswell/native_raminit/raminit_native.h
+++ b/src/northbridge/intel/haswell/native_raminit/raminit_native.h
@@ -42,6 +42,9 @@
 #define NUM_WDB_CL_MUX_SEEDS	3
 #define NUM_CADB_MUX_SEEDS	3
 
+/* Specified in PI ticks. 64 PI ticks == 1 qclk */
+#define tDQSCK_DRIFT		64
+
 /* ZQ calibration types */
 enum {
 	ZQ_INIT,	/* DDR3: ZQCL with tZQinit, LPDDR3: ZQ Init  with tZQinit  */
@@ -188,6 +191,7 @@ enum raminit_status {
 	RAMINIT_STATUS_MPLL_INIT_FAILURE,
 	RAMINIT_STATUS_POLL_TIMEOUT,
 	RAMINIT_STATUS_REUT_ERROR,
+	RAMINIT_STATUS_RCVEN_FAILURE,
 	RAMINIT_STATUS_UNSPECIFIED_ERROR, /** TODO: Deprecated in favor of specific values **/
 };
 
@@ -270,6 +274,10 @@ struct sysinfo {
 
 	union ddr_data_vref_adjust_reg dimm_vref;
 
+	uint8_t io_latency[NUM_CHANNELS][NUM_SLOTRANKS];
+	uint8_t rt_latency[NUM_CHANNELS][NUM_SLOTRANKS];
+	uint32_t rt_io_comp[NUM_CHANNELS];
+
 	uint32_t data_offset_train[NUM_CHANNELS][NUM_LANES];
 	uint32_t data_offset_comp[NUM_CHANNELS][NUM_LANES];
 
@@ -344,6 +352,11 @@ static inline void clear_data_offset_train_all(struct sysinfo *ctrl)
 	memset(ctrl->data_offset_train, 0, sizeof(ctrl->data_offset_train));
 }
 
+static inline uint32_t get_data_train_feedback(const uint8_t channel, const uint8_t byte)
+{
+	return mchbar_read32(DDR_DATA_TRAIN_FEEDBACK(channel, byte)); /* raw, unmasked */
+}
+
 /* Number of ticks to wait in units of 69.841279 ns (citation needed) */
 static inline void tick_delay(const uint32_t delay)
 {
@@ -399,6 +412,7 @@ enum raminit_status convert_timings(struct sysinfo *ctrl);
 enum raminit_status configure_mc(struct sysinfo *ctrl);
 enum raminit_status configure_memory_map(struct sysinfo *ctrl);
 enum raminit_status do_jedec_init(struct sysinfo *ctrl);
+enum raminit_status train_receive_enable(struct sysinfo *ctrl);
 
 void configure_timings(struct sysinfo *ctrl);
 void configure_refresh(struct sysinfo *ctrl);
diff --git a/src/northbridge/intel/haswell/native_raminit/reg_structs.h b/src/northbridge/intel/haswell/native_raminit/reg_structs.h
index b943259b91..b099f4bb82 100644
--- a/src/northbridge/intel/haswell/native_raminit/reg_structs.h
+++ b/src/northbridge/intel/haswell/native_raminit/reg_structs.h
@@ -297,6 +297,19 @@ union ddr_scram_misc_control_reg {
 	uint32_t raw;
 };
 
+union sc_io_latency_reg { // Layout used for the per-channel SC_IO_LATENCY register
+	struct __packed {
+		uint32_t iolat_rank0     : 4; // Bits  3:0
+		uint32_t iolat_rank1     : 4; // Bits  7:4
+		uint32_t iolat_rank2     : 4; // Bits 11:8
+		uint32_t iolat_rank3     : 4; // Bits 15:12
+		uint32_t rt_iocomp       : 6; // Bits 21:16
+		uint32_t                 : 9; // Bits 30:22 (unnamed padding)
+		uint32_t dis_rt_clk_gate : 1; // Bits 31:31
+	};
+	uint32_t raw;
+};
+
 union mcscheds_cbit_reg {
 	struct __packed {
 		uint32_t dis_opp_cas    : 1; // Bits  0:0
diff --git a/src/northbridge/intel/haswell/native_raminit/train_receive_enable.c b/src/northbridge/intel/haswell/native_raminit/train_receive_enable.c
new file mode 100644
index 0000000000..576c6bc21e
--- /dev/null
+++ b/src/northbridge/intel/haswell/native_raminit/train_receive_enable.c
@@ -0,0 +1,561 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#include <console/console.h>
+#include <northbridge/intel/haswell/haswell.h>
+#include <types.h>
+
+#include "raminit_native.h"
+#include "ranges.h"
+
+#define RCVEN_PLOT	RAM_DEBUG	/* printk level used for the per-step plots */
+
+static enum raminit_status change_rcven_timing(struct sysinfo *ctrl, const uint8_t channel)
+{	/* Re-derive per-rank IO latency for one channel and shift RcvEn to compensate */
+	int16_t max_rcven = -4096;	/* start values for the max/min search below */
+	int16_t min_rcven = 4096;
+	int16_t max_rcven_rank[NUM_SLOTRANKS];
+	int16_t min_rcven_rank[NUM_SLOTRANKS];
+	for (uint8_t rank = 0; rank < NUM_SLOTRANKS; rank++) {
+		max_rcven_rank[rank] = max_rcven;
+		min_rcven_rank[rank] = min_rcven;
+	}
+	for (uint8_t rank = 0; rank < NUM_SLOTRANKS; rank++) {
+		if (!rank_in_ch(ctrl, rank, channel))
+			continue;
+
+		for (uint8_t byte = 0; byte < ctrl->lanes; byte++) {
+			int16_t new_rcven = ctrl->rcven[channel][rank][byte];
+			new_rcven -= ctrl->io_latency[channel][rank] * 64;
+			if (max_rcven_rank[rank] < new_rcven)
+				max_rcven_rank[rank] = new_rcven;
+
+			if (min_rcven_rank[rank] > new_rcven)
+				min_rcven_rank[rank] = new_rcven;
+		}
+		if (max_rcven < max_rcven_rank[rank])
+			max_rcven = max_rcven_rank[rank];
+
+		if (min_rcven > min_rcven_rank[rank])
+			min_rcven = min_rcven_rank[rank];
+	}
+
+	/*
+	 * Determine how far we are from the ideal center point for RcvEn timing.
+	 * (PiIdeal - AveRcvEn) / 64 is the ideal number of cycles we should have
+	 * for IO latency. Command training will reduce this by 64, so plan for
+	 * that now in the ideal value. Round to closest integer.
+	 */
+	const int16_t rre_pi_ideal = 256 + 64;
+	const int16_t pi_reserve = 64;
+	const int16_t rcven_center = (max_rcven + min_rcven) / 2;
+	const int8_t iolat_target = DIV_ROUND_CLOSEST(rre_pi_ideal - rcven_center, 64);
+
+	int8_t io_g_offset = 0;		/* common offset, moved into rt_iocomp below */
+	int8_t io_lat[NUM_SLOTRANKS] = { 0 };
+	for (uint8_t rank = 0; rank < NUM_SLOTRANKS; rank++) {
+		if (!rank_in_ch(ctrl, rank, channel))
+			continue;
+
+		io_lat[rank] = iolat_target;
+
+		/* Nudge IO latency so min/max RcvEn stay in [pi_reserve, 511 - pi_reserve] */
+		const int16_t rcven_lower = 64 * io_lat[rank] + min_rcven_rank[rank];
+		if (rcven_lower < pi_reserve)
+			io_lat[rank] += DIV_ROUND_UP(pi_reserve - rcven_lower, 64);
+
+		const int16_t rcven_upper = 64 * io_lat[rank] + max_rcven_rank[rank];
+		if (rcven_upper > 511 - pi_reserve)
+			io_lat[rank] -= DIV_ROUND_UP(rcven_upper - (511 - pi_reserve), 64);
+
+		/* Track a global offset so this rank's programmed IO latency lands in 1..14 */
+		if (io_lat[rank] - io_g_offset > 14)
+			io_g_offset = io_lat[rank] - 14;
+
+		if (io_lat[rank] - io_g_offset < 1)
+			io_g_offset = io_lat[rank] - 1;
+		/* Move RcvEn by 64 for every cycle this rank's IO latency changed */
+		const int8_t cycle_offset = io_lat[rank] - ctrl->io_latency[channel][rank];
+		for (uint8_t byte = 0; byte < ctrl->lanes; byte++) {
+			ctrl->rcven[channel][rank][byte] += 64 * cycle_offset;
+			update_rxt(ctrl, channel, rank, byte, RXT_RESTORE, 0);
+		}
+	}
+
+	/* Calculate new IO comp latency, starting from the current register contents */
+	union sc_io_latency_reg sc_io_lat = {
+		.raw = mchbar_read32(SC_IO_LATENCY_ch(channel)),
+	};
+
+	/* Applying io_g_offset must keep the 6-bit rt_iocomp field within 0..0x3f */
+	if (io_g_offset < 0 && sc_io_lat.rt_iocomp < -io_g_offset) {
+		printk(BIOS_ERR, "%s: IO COMP underflow\n", __func__);
+		printk(BIOS_ERR, "io_g_offset: %d\n", io_g_offset);
+		printk(BIOS_ERR, "rt_iocomp: %u\n", sc_io_lat.rt_iocomp);
+		return RAMINIT_STATUS_RCVEN_FAILURE;
+	}
+	if (io_g_offset > 0 && io_g_offset > 0x3f - sc_io_lat.rt_iocomp) {
+		printk(BIOS_ERR, "%s: IO COMP overflow\n", __func__);
+		printk(BIOS_ERR, "io_g_offset: %d\n", io_g_offset);
+		printk(BIOS_ERR, "rt_iocomp: %u\n", sc_io_lat.rt_iocomp);
+		return RAMINIT_STATUS_RCVEN_FAILURE;
+	}
+	sc_io_lat.rt_iocomp += io_g_offset;
+	ctrl->rt_io_comp[channel] = sc_io_lat.rt_iocomp;
+	for (uint8_t rank = 0; rank < NUM_SLOTRANKS; rank++) {
+		if (ctrl->rankmap[channel] & BIT(rank))
+			ctrl->io_latency[channel][rank] = io_lat[rank] - io_g_offset;
+		/* NOTE(review): value is not masked to 4 bits before being ORed in - confirm */
+		const uint8_t shift = rank * 4;
+		sc_io_lat.raw &= ~(0xf << shift);
+		sc_io_lat.raw |= ctrl->io_latency[channel][rank] << shift;
+	}
+	mchbar_write32(SC_IO_LATENCY_ch(channel), sc_io_lat.raw);
+	return RAMINIT_STATUS_SUCCESS;
+}
+
+#define RL_START (256 + 24)	/* first RcvEn delay tried by the coarse sweep */
+#define RL_STOP  (384 + 24)	/* coarse sweep stops before reaching this delay */
+#define RL_STEP  8		/* coarse sweep increment */
+
+#define RE_NUM_SAMPLES	6	/* sample_dqs() compares feedback to BIT(RE_NUM_SAMPLES - 1) */
+
+/*
+ * Sanity-check a high region by its center and width. Fails when the center
+ * lies beyond RL_STOP, or when the width is outside the open range (32, 96):
+ * a phase is being calibrated, so a too large region is as bad as a small one.
+ */
+static enum raminit_status verify_high_region(const int32_t center, const int32_t lwidth)
+{
+	enum raminit_status status = RAMINIT_STATUS_RCVEN_FAILURE;
+	if (center > RL_STOP)
+		printk(BIOS_ERR, "RcvEn: Center of high (%d) higher than expected\n", center);
+	else if (lwidth <= 32)
+		printk(BIOS_ERR, "RcvEn: Width of high region (%d) too small\n", lwidth);
+	else if (lwidth >= 96)
+		printk(BIOS_ERR, "RcvEn: Width of high region (%d) too large\n", lwidth);
+	else
+		status = RAMINIT_STATUS_SUCCESS;
+
+	return status;
+}
+
+/* Copy the cached IO latency of `rank` into its 4-bit field of SC_IO_LATENCY */
+static void program_io_latency(struct sysinfo *ctrl, const uint8_t channel, const uint8_t rank)
+{
+	const uint8_t iolat = ctrl->io_latency[channel][rank];
+	mchbar_clrsetbits32(SC_IO_LATENCY_ch(channel), 0xf << (rank * 4), iolat << (rank * 4));
+}
+
+/* Hand `rl_delay` to update_rxt() as RXT_RCVEN for all lanes of `rank` on each channel */
+static void program_rl_delays(struct sysinfo *ctrl, const uint8_t rank, const uint16_t rl_delay)
+{
+	for (uint8_t ch = 0; ch < NUM_CHANNELS; ch++) {
+		if (rank_in_ch(ctrl, rank, ch)) {
+			for (uint8_t lane = 0; lane < ctrl->lanes; lane++)
+				update_rxt(ctrl, ch, rank, lane, RXT_RCVEN, rl_delay);
+		}
+	}
+}
+
+static bool sample_dqs(const uint8_t channel, const uint8_t byte)
+{	/* True when the low 9 feedback bits reach at least BIT(RE_NUM_SAMPLES - 1) */
+	return (get_data_train_feedback(channel, byte) & 0x1ff) >= BIT(RE_NUM_SAMPLES - 1);
+}
+
+/*
+ * Receive Enable (RcvEn) training. For each populated rank: sweep the RcvEn
+ * delay and center on the high region (steps 1-2), walk backwards until every
+ * byte samples low (step 3), add 1 qclk (step 4), walk forward to the rising
+ * edge (step 5) and subtract tDQSCK_DRIFT (step 6). The training overrides are
+ * then undone (on failure as well) and IO latency is synced per channel (step 7).
+ * Returns RAMINIT_STATUS_SUCCESS, or the status of the first failure.
+ */
+enum raminit_status train_receive_enable(struct sysinfo *ctrl)
+{
+	const struct reut_box reut_addr = {
+		.col = {
+			.start    = 0,
+			.stop     = 1023,
+			.inc_rate = 0,
+			.inc_val  = 1,
+		},
+	};
+	const struct wdb_pat wdb_pattern = {
+		.start_ptr  = 0,
+		.stop_ptr   = 9,
+		.inc_rate   = 32,
+		.dq_pattern = BASIC_VA,
+	};
+
+	const uint16_t bytemask = BIT(ctrl->lanes) - 1;
+	const uint8_t fine_step = 1;
+
+	const uint8_t rt_delta = is_hsw_ult() ? 4 : 2;
+	const uint8_t rt_io_comp = 21 + rt_delta;
+	const uint8_t rt_latency = 16 + rt_delta;
+	setup_io_test(ctrl, ctrl->chanmap, PAT_RD, 2, RE_NUM_SAMPLES + 1, &reut_addr, 0,
+		&wdb_pattern, 0, 8);
+
+	for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
+		if (!does_ch_exist(ctrl, channel))
+			continue;
+
+		for (uint8_t byte = 0; byte < ctrl->lanes; byte++) {
+			union ddr_data_control_2_reg data_control_2 = {
+				.raw = ctrl->dq_control_2[channel][byte],
+			};
+			data_control_2.force_rx_on = 1;
+			mchbar_write32(DQ_CONTROL_2(channel, byte), data_control_2.raw);
+		}
+		union ddr_data_control_0_reg data_control_0 = {
+			.raw = ctrl->dq_control_0[channel],
+		};
+		if (ctrl->lpddr) {
+			/**
+			 * W/A for b4618574 - @todo: remove for HSW ULT C0
+			 * Can't have force_odt_on together with leaker, disable LPDDR
+			 * mode during this training step. lpddr_mode is restored
+			 * at the end of this function from the host structure.
+			 */
+			data_control_0.lpddr_mode = 0;
+			mchbar_write32(DDR_DATA_ch_CONTROL_0(channel), data_control_0.raw);
+		}
+		data_control_0.force_odt_on     = 1;
+		data_control_0.rl_training_mode = 1;
+		mchbar_write32(DDR_DATA_ch_CONTROL_0(channel), data_control_0.raw);
+		mchbar_write32(SC_IO_LATENCY_ch(channel), (union sc_io_latency_reg) {
+			.rt_iocomp = rt_io_comp,
+		}.raw);
+	}
+	enum raminit_status status = RAMINIT_STATUS_SUCCESS;
+	for (uint8_t rank = 0; rank < NUM_SLOTRANKS; rank++) {
+		if (!does_rank_exist(ctrl, rank))
+			continue;
+
+		/*
+		 * Set initial roundtrip latency values. Assume -4 QCLK for worst board
+		 * layout. This is calculated as HW_ROUNDT_LAT_DEFAULT_VALUE plus:
+		 *
+		 *   DDR3: Default + (2 * tAA) + 4 QCLK + PI_CLK + N-mode value * 2
+		 * LPDDR3: Default + (2 * tAA) + 4 QCLK + PI_CLK + tDQSCK_max
+		 *
+		 * N-mode is 3 during training mode. Both channels use the same timings.
+		 */
+		/** TODO: differs for LPDDR **/
+		const uint32_t tmp = MAX(ctrl->multiplier, 4) + 5 + 2 * ctrl->tAA;
+		const uint32_t initial_rt_latency = MIN(rt_latency + tmp, 0x3f);
+
+		uint8_t chanmask = 0;
+		for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
+			chanmask |= select_reut_ranks(ctrl, channel, BIT(rank));
+			if (!rank_in_ch(ctrl, rank, channel))
+				continue;
+
+			ctrl->io_latency[channel][rank] = 0;
+			mchbar_write8(SC_ROUNDT_LAT_ch(channel) + rank, initial_rt_latency);
+			ctrl->rt_latency[channel][rank] = initial_rt_latency;
+		}
+
+		printk(BIOS_DEBUG, "Rank %u\n", rank);
+		printk(BIOS_DEBUG, "Steps 1 and 2: Find middle of high region\n");
+		printk(RCVEN_PLOT, "Byte");
+		for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
+			if (!rank_in_ch(ctrl, rank, channel))
+				continue;
+
+			printk(RCVEN_PLOT, "\t");
+			for (uint8_t byte = 0; byte < ctrl->lanes; byte++)
+				printk(RCVEN_PLOT, "%u ", byte);
+		}
+		printk(RCVEN_PLOT, "\nRcvEn\n");
+		struct phase_train_data region_data[NUM_CHANNELS][NUM_LANES] = { 0 };
+		for (uint16_t rl_delay = RL_START; rl_delay < RL_STOP; rl_delay += RL_STEP) {
+			printk(RCVEN_PLOT, " % 3d", rl_delay);
+			program_rl_delays(ctrl, rank, rl_delay);
+			run_io_test(ctrl, chanmask, BASIC_VA, true);
+			for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
+				if (!rank_in_ch(ctrl, rank, channel))
+					continue;
+
+				printk(RCVEN_PLOT, "\t");
+				for (uint8_t byte = 0; byte < ctrl->lanes; byte++) {
+					const bool high = sample_dqs(channel, byte);
+					printk(RCVEN_PLOT, high ? ". " : "# ");
+					phase_record_pass(
+						&region_data[channel][byte],
+						high,
+						rl_delay,
+						RL_START,
+						RL_STEP);
+				}
+			}
+			printk(RCVEN_PLOT, "\n");
+		}
+		printk(RCVEN_PLOT, "\n");
+		printk(BIOS_DEBUG, "Update RcvEn timing to be in the center of high region\n");
+		for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
+			if (!rank_in_ch(ctrl, rank, channel))
+				continue;
+
+			printk(BIOS_DEBUG, "C%u.R%u: \tLeft\tRight\tWidth\tCenter\n",
+				channel, rank);
+			for (uint8_t byte = 0; byte < ctrl->lanes; byte++) {
+				struct phase_train_data *const curr_data =
+						&region_data[channel][byte];
+				phase_append_current_to_initial(curr_data, RL_START, RL_STEP);
+				const int32_t lwidth = range_width(curr_data->largest);
+				const int32_t center = range_center(curr_data->largest);
+				printk(BIOS_DEBUG, "   B%u: \t%d\t%d\t%d\t%d\n",
+					byte,
+					curr_data->largest.start,
+					curr_data->largest.end,
+					lwidth,
+					center);
+
+				status = verify_high_region(center, lwidth);
+				if (status) {
+					printk(BIOS_ERR,
+						"RcvEn problems on channel %u, byte %u\n",
+						channel, byte);
+					goto clean_up;
+				}
+				ctrl->rcven[channel][rank][byte] = center;
+				update_rxt(ctrl, channel, rank, byte, RXT_RESTORE, 0);
+			}
+			printk(BIOS_DEBUG, "\n");
+		}
+
+		printk(BIOS_DEBUG, "Step 3: Quarter preamble - Walk backwards\n");
+		printk(RCVEN_PLOT, "Byte");
+		for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
+			if (!rank_in_ch(ctrl, rank, channel))
+				continue;
+
+			printk(RCVEN_PLOT, "\t");
+			for (uint8_t byte = 0; byte < ctrl->lanes; byte++)
+				printk(RCVEN_PLOT, "%u ", byte);
+		}
+		printk(RCVEN_PLOT, "\nIOLAT\n");
+		bool done = false;
+		while (!done) {
+			run_io_test(ctrl, chanmask, BASIC_VA, true);
+			done = true;
+			for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
+				if (!rank_in_ch(ctrl, rank, channel))
+					continue;
+
+				printk(RCVEN_PLOT, "  %2u\t", ctrl->io_latency[channel][rank]);
+				uint16_t highs = 0;
+				for (uint8_t byte = 0; byte < ctrl->lanes; byte++) {
+					const bool high = sample_dqs(channel, byte);
+					printk(RCVEN_PLOT, high ? "H " : "L ");
+					if (high)
+						highs |= BIT(byte);
+				}
+				if (!highs)
+					continue;
+
+				done = false;
+
+				/* If all bytes sample high, adjust timing globally */
+				if (highs == bytemask && ctrl->io_latency[channel][rank] < 14) {
+					ctrl->io_latency[channel][rank] += 2;
+					ctrl->io_latency[channel][rank] %= 16;
+					program_io_latency(ctrl, channel, rank);
+					continue;
+				}
+
+				/* Otherwise, adjust individual bytes */
+				for (uint8_t byte = 0; byte < ctrl->lanes; byte++) {
+					if (!(highs & BIT(byte)))
+						continue;
+
+					if (ctrl->rcven[channel][rank][byte] < 128) {
+						printk(BIOS_ERR,
+							"RcvEn underflow: walking backwards\n");
+						printk(BIOS_ERR,
+							"For channel %u, rank %u, byte %u\n",
+							channel, rank, byte);
+						status = RAMINIT_STATUS_RCVEN_FAILURE;
+						goto clean_up;
+					}
+					ctrl->rcven[channel][rank][byte] -= 128;
+					update_rxt(ctrl, channel, rank, byte, RXT_RESTORE, 0);
+				}
+			}
+			printk(RCVEN_PLOT, "\n");
+		}
+		for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
+			if (!rank_in_ch(ctrl, rank, channel))
+				continue;
+
+			printk(BIOS_DEBUG, "\nC%u:  Preamble\n", channel);
+			for (uint8_t byte = 0; byte < ctrl->lanes; byte++) {
+				printk(BIOS_DEBUG,
+					" B%u: %u\n", byte, ctrl->rcven[channel][rank][byte]);
+			}
+		}
+		printk(BIOS_DEBUG, "\n");
+
+		printk(BIOS_DEBUG, "Step 4: Add 1 qclk\n");
+		for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
+			if (!rank_in_ch(ctrl, rank, channel))
+				continue;
+
+			for (uint8_t byte = 0; byte < ctrl->lanes; byte++) {
+				ctrl->rcven[channel][rank][byte] += 64;
+				update_rxt(ctrl, channel, rank, byte, RXT_RESTORE, 0);
+			}
+		}
+		printk(BIOS_DEBUG, "\n");
+
+		printk(BIOS_DEBUG, "Step 5: Walk forward to find rising edge\n");
+		printk(RCVEN_PLOT, "Byte");
+		for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
+			if (!rank_in_ch(ctrl, rank, channel))
+				continue;
+
+			printk(RCVEN_PLOT, "\t");
+			for (uint8_t byte = 0; byte < ctrl->lanes; byte++)
+				printk(RCVEN_PLOT, "%u ", byte);
+		}
+		printk(RCVEN_PLOT, "\n inc\n");
+		uint16_t ch_result[NUM_CHANNELS] = { 0 };
+		uint8_t inc_preamble[NUM_CHANNELS][NUM_LANES] = { 0 };
+		for (uint8_t inc = 0; inc < 64; inc += fine_step) {
+			printk(RCVEN_PLOT, " %2u\t", inc);
+			run_io_test(ctrl, chanmask, BASIC_VA, true);
+			done = true;
+			for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
+				if (!rank_in_ch(ctrl, rank, channel))
+					continue;
+
+				for (uint8_t byte = 0; byte < ctrl->lanes; byte++) {
+					if (ch_result[channel] & BIT(byte)) {
+						/* Skip bytes that are already done */
+						printk(RCVEN_PLOT, ". ");
+						continue;
+					}
+					const bool pass = sample_dqs(channel, byte);
+					printk(RCVEN_PLOT, pass ? ". " : "# ");
+					if (pass) {
+						ch_result[channel] |= BIT(byte);
+						continue;
+					}
+					ctrl->rcven[channel][rank][byte] += fine_step;
+					update_rxt(ctrl, channel, rank, byte, RXT_RESTORE, 0);
+					inc_preamble[channel][byte] = inc;
+				}
+				printk(RCVEN_PLOT, "\t");
+				if (ch_result[channel] != bytemask)
+					done = false;
+			}
+			printk(RCVEN_PLOT, "\n");
+			if (done)
+				break;
+		}
+		printk(BIOS_DEBUG, "\n");
+		if (!done) {
+			printk(BIOS_ERR, "Error: Preamble edge not found for all bytes\n");
+			printk(BIOS_ERR, "The final RcvEn results are as follows:\n");
+			for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
+				if (!rank_in_ch(ctrl, rank, channel))
+					continue;
+
+				printk(BIOS_ERR, "Channel %u Rank %u:  preamble\n",
+					channel, rank);
+				/* ch_result has a bit set for each byte that found its edge */
+				for (uint8_t byte = 0; byte < ctrl->lanes; byte++) {
+					printk(BIOS_ERR, " Byte %u: %u%s\n", byte,
+						ctrl->rcven[channel][rank][byte],
+						(ch_result[channel] & BIT(byte))
+							? ""
+							: " *** Check this byte! ***");
+				}
+			}
+			status = RAMINIT_STATUS_RCVEN_FAILURE;
+			goto clean_up;
+		}
+
+		printk(BIOS_DEBUG, "Step 6: center on preamble and clean up rank\n");
+		for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
+			if (!rank_in_ch(ctrl, rank, channel))
+				continue;
+
+			printk(BIOS_DEBUG, "C%u:  Preamble increment\n", channel);
+			for (uint8_t byte = 0; byte < ctrl->lanes; byte++) {
+				/*
+				 * For Traditional, pull in RcvEn by 64. For ULT, take the DQS
+				 * drift into account to the specified guardband: tDQSCK_DRIFT.
+				 */
+				ctrl->rcven[channel][rank][byte] -= tDQSCK_DRIFT;
+				update_rxt(ctrl, channel, rank, byte, RXT_RESTORE, 0);
+				printk(BIOS_DEBUG, " B%u: %u      %u\n", byte,
+					ctrl->rcven[channel][rank][byte],
+					inc_preamble[channel][byte]);
+			}
+			printk(BIOS_DEBUG, "\n");
+		}
+		printk(BIOS_DEBUG, "\n");
+	}
+
+clean_up:
+	for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
+		if (!does_ch_exist(ctrl, channel))
+			continue;
+
+		if (ctrl->lpddr) {
+			/**
+			 * W/A for b4618574 - @todo: remove for HSW ULT C0
+			 * Can't have force_odt_on together with leaker, disable LPDDR mode for
+			 * this training step. This write will disable force_odt_on while still
+			 * keeping LPDDR mode disabled. Second write will restore LPDDR mode.
+			 */
+			union ddr_data_control_0_reg data_control_0 = {
+				.raw = ctrl->dq_control_0[channel],
+			};
+			data_control_0.lpddr_mode = 0;
+			mchbar_write32(DDR_DATA_ch_CONTROL_0(channel), data_control_0.raw);
+		}
+		mchbar_write32(DDR_DATA_ch_CONTROL_0(channel), ctrl->dq_control_0[channel]);
+		for (uint8_t byte = 0; byte < ctrl->lanes; byte++) {
+			mchbar_write32(DQ_CONTROL_2(channel, byte),
+					ctrl->dq_control_2[channel][byte]);
+		}
+	}
+	io_reset();
+	if (status)
+		return status;
+
+	printk(BIOS_DEBUG, "Step 7: Sync IO latency across all ranks\n");
+	for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
+		if (!does_ch_exist(ctrl, channel))
+			continue;
+
+		status = change_rcven_timing(ctrl, channel);
+		if (status)
+			return status;
+	}
+	printk(BIOS_DEBUG, "\nFinal Receive Enable and IO latency settings:\n");
+	for (uint8_t channel = 0; channel < NUM_CHANNELS; channel++) {
+		if (!does_ch_exist(ctrl, channel))
+			continue;
+
+		for (uint8_t rank = 0; rank < NUM_SLOTRANKS; rank++) {
+			if (!rank_in_ch(ctrl, rank, channel))
+				continue;
+
+			const union sc_io_latency_reg sc_io_latency = {
+				.raw = mchbar_read32(SC_IO_LATENCY_ch(channel)),
+			};
+			printk(BIOS_DEBUG, "  C%u.R%u: IOLAT = %u  rt_iocomp = %u\n", channel,
+				rank, ctrl->io_latency[channel][rank], sc_io_latency.rt_iocomp);
+			for (uint8_t byte = 0; byte < ctrl->lanes; byte++) {
+				printk(BIOS_DEBUG, "   B%u:   %u\n", byte,
+					ctrl->rcven[channel][rank][byte]);
+			}
+			printk(BIOS_DEBUG, "\n");
+		}
+	}
+	return status;
+}
diff --git a/src/northbridge/intel/haswell/registers/mchbar.h b/src/northbridge/intel/haswell/registers/mchbar.h
index a81559bb1e..9172d4f2b0 100644
--- a/src/northbridge/intel/haswell/registers/mchbar.h
+++ b/src/northbridge/intel/haswell/registers/mchbar.h
@@ -18,6 +18,8 @@
 #define RX_TRAIN_ch_r_b(ch, rank, byte)		_DDRIO_C_R_B(0x0000, ch, rank, byte)
 #define TX_TRAIN_ch_r_b(ch, rank, byte)		_DDRIO_C_R_B(0x0020, ch, rank, byte)
 
+#define DDR_DATA_TRAIN_FEEDBACK(ch, byte)	_DDRIO_C_R_B(0x0054, ch, 0, byte)
+
 #define DQ_CONTROL_2(ch, byte)			_DDRIO_C_R_B(0x0064, ch, 0, byte)
 #define DQ_CONTROL_0(ch, byte)			_DDRIO_C_R_B(0x0074, ch, 0, byte)
 
@@ -100,6 +102,7 @@
 #define COMMAND_RATE_LIMIT_ch(ch)		_MCMAIN_C(0x4010, ch)
 #define TC_BANK_RANK_D_ch(ch)			_MCMAIN_C(0x4014, ch)
 #define SC_ROUNDT_LAT_ch(ch)			_MCMAIN_C(0x4024, ch)
+#define SC_IO_LATENCY_ch(ch)			_MCMAIN_C(0x4028, ch)
 
 #define REUT_ch_PAT_WDB_CL_MUX_CFG(ch)		_MCMAIN_C(0x4040, ch)
 
-- 
2.39.2