[PATCH v2 6/9] ram: octeon: Add MIPS Octeon3 DDR4 support (part 2/3)

Daniel Schwierzeck daniel.schwierzeck at gmail.com
Wed Aug 19 16:47:27 CEST 2020


Am Montag, den 17.08.2020, 14:12 +0200 schrieb Stefan Roese:
> From: Aaron Williams <awilliams at marvell.com>
> 
> This Octeon 3 DDR driver is ported from the 2013 Cavium / Marvell U-Boot
> repository. It currently supports DDR4 on Octeon 3. It can be later
> extended to support also DDR3 and Octeon 2 platforms.
> 
> Part 2 includes the very complex Octeon 3 DDR4 configuration
> 
> Signed-off-by: Aaron Williams <awilliams at marvell.com>
> Signed-off-by: Stefan Roese <sr at denx.de>
> ---
> 
> (no changes since v1)
> 
>  drivers/ram/octeon/octeon3_lmc.c | 11484 +++++++++++++++++++++++++++++
>  1 file changed, 11484 insertions(+)
>  create mode 100644 drivers/ram/octeon/octeon3_lmc.c

some general notes:
- there are a lot of C++ style comments
- there are some internal configuration variables and corresponding code
which is not built. Can't you remove that to cut down the code size? Or
convert them to Kconfig symbols if the code is needed.

> 
> diff --git a/drivers/ram/octeon/octeon3_lmc.c b/drivers/ram/octeon/octeon3_lmc.c
> new file mode 100644
> index 0000000000..21e331897b
> --- /dev/null
> +++ b/drivers/ram/octeon/octeon3_lmc.c
> @@ -0,0 +1,11484 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2020 Marvell International Ltd.
> + *
> + * https://spdx.org/licenses

this line is superfluous

> + */
> +
> +#include <command.h>

do you really need this?

> +#include <dm.h>
> +#include <hang.h>
> +#include <i2c.h>
> +#include <ram.h>
> +#include <time.h>
> +
> +#include <linux/bitops.h>
> +#include <linux/io.h>
> +
> +#include <mach/octeon_ddr.h>
> +
> +/* Random number generator stuff */
> +
> +#define CVMX_RNM_CTL_STATUS	0x0001180040000000	/* CSR updated by cvmx_rng_enable() */
> +#define CVMX_OCT_DID_RNG	8ULL	/* major DID used to build the RNG read address */
> +
> +/*
> + * Compose an I/O address: bit 48 set, @major_did placed at bit 43 and
> + * @sub_did placed at bit 40.
> + */
> +static u64 cvmx_build_io_address(u64 major_did, u64 sub_did)
> +{
> +	u64 io_addr = 0x1ull << 48;
> +
> +	io_addr |= major_did << 43;
> +	io_addr |= sub_did << 40;
> +
> +	return io_addr;
> +}
> +
> +/*
> + * Return the 64-bit value read from the I/O address built from
> + * CVMX_OCT_DID_RNG (major DID) and sub DID 0.
> + */
> +static u64 cvmx_rng_get_random64(void)
> +{
> +	u64 rng_addr = cvmx_build_io_address(CVMX_OCT_DID_RNG, 0);
> +
> +	return csr_rd(rng_addr);
> +}
> +
> +/* Read-modify-write CVMX_RNM_CTL_STATUS, setting bits 0 and 1. */
> +static void cvmx_rng_enable(void)
> +{
> +	u64 ctl_status = csr_rd(CVMX_RNM_CTL_STATUS);
> +
> +	ctl_status |= BIT(1) | BIT(0);
> +	csr_wr(CVMX_RNM_CTL_STATUS, ctl_status);
> +}
> +
> +#define CAVIUM_ONLY
> +
> +/* FIXME: set to 1 for development-level messages, 0 for production */
> +#define RLEVEL_PRINTALL_DEFAULT 1
> +#define WLEVEL_PRINTALL_DEFAULT 1
> +
> +#define ALLOW_BY_RANK_INIT       1
> +#define USE_ORIG_TEST_DRAM_BYTE  0
> +
> +#define RLEXTRAS_PATCH     1	/* write to unused RL rank entries */
> +#define WLEXTRAS_PATCH     1	/* write to unused WL rank entries */
> +#define ADD_48_OHM_SKIP    0
> +#define NOSKIP_40_48_OHM   1
> +#define NOSKIP_48_STACKED  1
> +#define NOSKIP_FOR_MINI    1
> +#define NOSKIP_FOR_2S_1R   1
> +
> +/*
> + * Define how many HW RL samples per rank to take.
> + * Multiple samples will allow looking for the best sample score.
> + */
> +#define RLEVEL_SAMPLES_DEFAULT     3
> +
> +#define FAILSAFE_CHECK           1
> +
> +#define PERFECT_BITMASK_COUNTING 1
> +/* Parenthesized so the expansion stays one operand wherever it is used */
> +#define PRINT_PERFECT_COUNTS     (PERFECT_BITMASK_COUNTING && 1)
> +
> +#define DISABLE_SW_WL_PASS_2  1
> +#define SW_WL_CHECK_PATCH     1
> +#define HW_WL_MAJORITY        1
> +/*
> + * Try HW WL base alternate if available when SW WL fails.
> + * Parenthesized so the expansion stays one operand wherever it is used.
> + */
> +#define SWL_TRY_HWL_ALT       (HW_WL_MAJORITY && 1)
> +/*
> + * Define how many HW WL samples to take for majority voting.
> + * MUST BE odd!!
> + * Assume there should only be 2 possible values that will show up,
> + * so treat ties as a problem!!!
> + * NOTE: do not change this without checking the code!!!
> + */
> +#define WLEVEL_LOOPS_DEFAULT     5
> +
> +#define SWL_WITH_HW_ALTS_CHOOSE_SW 0	/* FIXME: allow override? */
> +
> +#define SW_WLEVEL_HW_DEFAULT 1
> +/* collect and print LMC utilization using SWL software algorithm */
> +#define ENABLE_SW_WLEVEL_UTILIZATION 1
> +
> +#define COUNT_RL_CANDIDATES 1
> +
> +#define DDR3_DAC_OVERRIDE  1
> +
> +#define DEFAULT_BEST_RANK_SCORE  9999999
> +#define MAX_RANK_SCORE_LIMIT     99	/* FIXME? */
> +
> +#define ENABLE_COMPUTED_VREF_ADJUSTMENT 1
> +
> +#undef ENABLE_SLOT_CTL_ACCESS
> +
> +#undef DEBUG_PERFORM_DDR3_SEQUENCE
> +/* Sequence tracing is compiled out: the arguments are discarded unevaluated */
> +#define ddr_seq_print(format, ...) do {} while (0)
> +
> +struct wlevel_bitcnt {
> +	int bitcnt[4];
> +};
> +
> +static void display_dac_dbi_settings(int lmc, int dac_or_dbi,
> +				     int ecc_ena, int *settings, char *title);
> +
> +static unsigned short load_dac_override(struct ddr_priv *priv, int if_num,
> +					int dac_value, int byte);
> +
> +/* Values for the "mode" argument (presumably of run_best_hw_patterns() below; confirm) */
> +#define DBTRAIN_TEST 0
> +#define DBTRAIN_DBI  1
> +#define DBTRAIN_LFSR 2
> +
> +static int run_best_hw_patterns(struct ddr_priv *priv, int lmc, u64 phys_addr,
> +				int mode, u64 *xor_data);
> +
> +#define LMC_DDR3_RESET_ASSERT   0	/* "reset" value: RESET_CTL[DDR3RST] = 0 */
> +#define LMC_DDR3_RESET_DEASSERT 1	/* "reset" value: RESET_CTL[DDR3RST] = 1 */
> +
> +/*
> + * Drive the DDRn_RESET_L pin of one LMC through
> + * LMC(0..3)_RESET_CTL[DDR3RST].
> + *
> + * @priv:	driver private data passed to the LMC CSR accessors
> + * @if_num:	LMC interface number
> + * @reset:	LMC_DDR3_RESET_ASSERT or LMC_DDR3_RESET_DEASSERT
> + *
> + * This covers substeps 4-6 of the LMC RESET initialization when called
> + * with LMC_DDR3_RESET_DEASSERT:
> + * 4. Deassert DDRn_RESET_L pin by writing
> + *    LMC(0..3)_RESET_CTL[DDR3RST] = 1
> + *    without modifying any other LMC(0..3)_RESET_CTL fields.
> + * 5. Read LMC(0..3)_RESET_CTL and wait for the result.
> + * 6. Wait a minimum of 500us. This guarantees the necessary T = 500us
> + *    delay between DDRn_RESET_L deassertion and DDRn_DIMM*_CKE*
> + *    assertion.
> + */
> +static void cn7xxx_lmc_ddr3_reset(struct ddr_priv *priv, int if_num, int reset)
> +{
> +	union cvmx_lmcx_reset_ctl reset_ctl;
> +	const char *action;
> +
> +	if (reset == LMC_DDR3_RESET_DEASSERT)
> +		action = "De-asserting";
> +	else
> +		action = "Asserting";
> +
> +	debug("LMC%d %s DDR_RESET_L\n", if_num, action);
> +
> +	/* Only DDR3RST changes; all other fields are written back as read */
> +	reset_ctl.u64 = lmc_rd(priv, CVMX_LMCX_RESET_CTL(if_num));
> +	reset_ctl.cn78xx.ddr3rst = reset;
> +	lmc_wr(priv, CVMX_LMCX_RESET_CTL(if_num), reset_ctl.u64);
> +
> +	/* Read the register back and wait for the result */
> +	lmc_rd(priv, CVMX_LMCX_RESET_CTL(if_num));
> +
> +	/* The 500us wait is applied for assertion and deassertion alike */
> +	udelay(500);
> +}
> +
> +/*
> + * 5.9.6 LMC RESET Initialization
> + *
> + * The purpose of this step is to assert/deassert the RESET# pin at the
> + * DDR3/DDR4 parts. This LMC RESET step is done for all enabled LMCs.
> + *
> + * It may be appropriate to skip this step if the DDR3/DDR4 DRAM parts
> + * are in self refresh and are currently preserving their contents.
> + * (Software can determine this via LMC(0..3)_RESET_CTL[DDR3PSV] in some
> + * circumstances.) The remainder of this description assumes that the DRAM
> + * contents need not be preserved.
> + *
> + * It also assumes that the CN78XX DDRn_RESET_L pin is attached to the
> + * RESET# pin of the attached DDR3/DDR4 parts, as will be appropriate in
> + * many systems. (In other systems, such as ones that can preserve
> + * DDR3/DDR4 part contents while CN78XX is powered down, it will not be
> + * appropriate to directly attach the CN78XX DDRn_RESET_L pin to DRESET#
> + * of the DDR3/DDR4 parts, and this description may not apply.)
> + *
> + * The sequence for LMCn consists of six substeps:
> + *
> + * 1. If not done already, assert DDRn_RESET_L pin by writing
> + *    LMC(0..3)_RESET_CTL[DDR3RST] = 0 without modifying any other
> + *    LMC(0..3)_RESET_CTL fields.
> + * 2. Read LMC(0..3)_RESET_CTL and wait for the result.
> + * 3. Wait until RESET# assertion-time requirement from JEDEC DDR3/DDR4
> + *    specification is satisfied (200 us during a power-on ramp, 100ns
> + *    when power is already stable).
> + * 4. Deassert DDRn_RESET_L pin by writing
> + *    LMC(0..3)_RESET_CTL[DDR3RST] = 1
> + *    without modifying any other LMC(0..3)_RESET_CTL fields.
> + * 5. Read LMC(0..3)_RESET_CTL and wait for the result.
> + * 6. Wait a minimum of 500us. This guarantees the necessary T = 500us
> + *    delay between DDRn_RESET_L deassertion and DDRn_DIMM*_CKE*
> + *    assertion.
> + *
> + * @priv:	driver private data passed to the LMC CSR accessors
> + * @node:	not referenced by this function
> + * @if_num:	LMC interface number
> + */
> +static void perform_lmc_reset(struct ddr_priv *priv, int node, int if_num)
> +{
> +	/* Nothing is touched when ddr_memory_preserved() reports true */
> +	if (ddr_memory_preserved(priv))
> +		return;
> +
> +	/* Substep 2 */
> +	lmc_rd(priv, CVMX_LMCX_RESET_CTL(if_num));
> +
> +	/* Substep 3 */
> +	udelay(200);
> +
> +	/* Substeps 4-6 */
> +	cn7xxx_lmc_ddr3_reset(priv, if_num, LMC_DDR3_RESET_DEASSERT);
> +
> +	/* Toggle reset again: assert, then de-assert, one more time */
> +	cn7xxx_lmc_ddr3_reset(priv, if_num, LMC_DDR3_RESET_ASSERT);
> +	cn7xxx_lmc_ddr3_reset(priv, if_num, LMC_DDR3_RESET_DEASSERT);
> +}
> +
> +/*
> + * Start one LMC sequence on the selected ranks and poll for completion.
> + *
> + * @priv:	driver private data passed to the LMC CSR accessors
> + * @rank_mask:	value written to LMC(0)_CONFIG[RANKMASK]
> + * @if_num:	LMC interface number
> + * @sequence:	value written to LMC(0)_SEQ_CTL[SEQ_SEL]
> + *
> + * SEQ_CTL is polled up to 100 times, 100us apart. If SEQ_COMPLETE is not
> + * seen by then, "Sequence %d timed out" is printed and the function
> + * returns; no status is reported to the caller.
> + *
> + * Hardware procedure implemented here:
> + *
> + * 3. Without changing any other fields in LMC(0)_CONFIG, write
> + *    LMC(0)_CONFIG[RANKMASK] then write both
> + *    LMC(0)_SEQ_CTL[SEQ_SEL,INIT_START] = 1 with a single CSR write
> + *    operation. LMC(0)_CONFIG[RANKMASK] bits should be set to indicate
> + *    the ranks that will participate in the sequence.
> + *
> + *    The LMC(0)_SEQ_CTL[SEQ_SEL] value should select power-up/init or
> + *    selfrefresh exit, depending on whether the DRAM parts are in
> + *    self-refresh and whether their contents should be preserved. While
> + *    LMC performs these sequences, it will not perform any other DDR3
> + *    transactions. When the sequence is complete, hardware sets the
> + *    LMC(0)_CONFIG[INIT_STATUS] bits for the ranks that have been
> + *    initialized.
> + *
> + *    If power-up/init is selected immediately following a DRESET
> + *    assertion, LMC executes the sequence described in the "Reset and
> + *    Initialization Procedure" section of the JEDEC DDR3
> + *    specification. This includes activating CKE, writing all four DDR3
> + *    mode registers on all selected ranks, and issuing the required
> + *    ZQCL command. The LMC(0)_CONFIG[RANKMASK] value should select all
> + *    ranks with attached DRAM in this case. If
> + *    LMC(0)_CONTROL[RDIMM_ENA] = 1, LMC writes the JEDEC standard
> + *    SSTE32882 control words selected by LMC(0)_DIMM_CTL[DIMM*_WMASK]
> + *    between DDR_CKE* signal assertion and the first DDR3 mode register
> + *    write operation. LMC(0)_DIMM_CTL[DIMM*_WMASK] should be cleared to
> + *    0 if the corresponding DIMM is not present.
> + *
> + *    If self-refresh exit is selected, LMC executes the required SRX
> + *    command followed by a refresh and ZQ calibration. Section 4.5
> + *    describes behavior of a REF + ZQCS.  LMC does not write the DDR3
> + *    mode registers as part of this sequence, and the mode register
> + *    parameters must match at self-refresh entry and exit times.
> + *
> + * 4. Read LMC(0)_SEQ_CTL and wait for LMC(0)_SEQ_CTL[SEQ_COMPLETE]
> + *    to be set.
> + *
> + * 5. Read LMC(0)_CONFIG[INIT_STATUS] and confirm that all ranks have
> + *    been initialized. (This function does not perform step 5.)
> + */
> +void oct3_ddr3_seq(struct ddr_priv *priv, int rank_mask, int if_num,
> +		   int sequence)
> +{
> +	union cvmx_lmcx_seq_ctl seq_ctl;
> +	union cvmx_lmcx_config lmc_config;
> +	int timeout;
> +
> +	lmc_config.u64 = lmc_rd(priv, CVMX_LMCX_CONFIG(if_num));
> +	lmc_config.s.rankmask = rank_mask;
> +	lmc_wr(priv, CVMX_LMCX_CONFIG(if_num), lmc_config.u64);
> +
> +	/* SEQ_SEL and INIT_START go out together in one CSR write below */
> +	seq_ctl.u64 = 0;
> +
> +	seq_ctl.s.init_start = 1;
> +	seq_ctl.s.seq_sel = sequence;
> +
> +	ddr_seq_print
> +	    ("Performing LMC sequence: rank_mask=0x%02x, sequence=0x%x, %s\n",
> +	     rank_mask, sequence, sequence_str[sequence]);
> +
> +	if (seq_ctl.s.seq_sel == 3)
> +		debug("LMC%d: Exiting Self-refresh Rank_mask:%x\n", if_num,
> +		      rank_mask);
> +
> +	lmc_wr(priv, CVMX_LMCX_SEQ_CTL(if_num), seq_ctl.u64);
> +	lmc_rd(priv, CVMX_LMCX_SEQ_CTL(if_num));
> +
> +	timeout = 100;
> +	for (;;) {
> +		udelay(100);	/* Wait a while */
> +		seq_ctl.u64 = lmc_rd(priv, CVMX_LMCX_SEQ_CTL(if_num));
> +
> +		/*
> +		 * Test completion before the poll budget, so a sequence
> +		 * that finishes on the last poll is not reported as a
> +		 * timeout.
> +		 */
> +		if (seq_ctl.s.seq_complete == 1)
> +			break;
> +
> +		if (--timeout == 0) {
> +			printf("Sequence %d timed out\n", sequence);
> +			break;
> +		}
> +	}
> +
> +	ddr_seq_print("           LMC sequence=%x: Completed.\n", sequence);
> +}
> +
> +#if USE_ORIG_TEST_DRAM_BYTE
> +
> +#define DO_LIKE_RANDOM_XOR 1
> +
> +static int test_dram_byte64(struct ddr_priv *priv, int if_num, u64 p,
> +			    u64 bitmask, u64 *xor_data)
> +{
> +	u64 p1, p2, d1, d2;
> +	u64 v, v1;
> +	u64 p2offset = 0x4000000;
> +	u64 datamask;
> +	u64 xor;
> +	int i, j, k;
> +	int errors = 0;
> +	int index;
> +	int bitno = (octeon_is_cpuid(OCTEON_CN7XXX)) ? 20 : 18;
> +#if DO_LIKE_RANDOM_XOR
> +	u64 pattern1 = cvmx_rng_get_random64();
> +	u64 this_pattern;
> +#endif
> +	u64 bad_bits[2] = { 0, 0 };
> +	int node = 0;
> +
> +	if (octeon_is_cpuid(OCTEON_CN73XX) || octeon_is_cpuid(OCTEON_CNF75XX))
> +		bitno = 18;
> +
> +	// When doing in parallel, the caller must provide full 8-byte bitmask.
> +	// Byte lanes may be clear in the mask to indicate no testing on that
> +	// lane.
> +	datamask = bitmask;
> +
> +	// final address must include LMC and node
> +	p |= (if_num << 7);	/* Map address into proper interface */
> +	p |= (u64)node << CVMX_NODE_MEM_SHIFT;	// map to node
> +	p |= 1ull << 63;
> +
> +	/* Add offset to both test regions to not clobber boot stuff
> +	 * when running from L2.
> +	 */
> +	p += 0x4000000;
> +
> +	debug("N%d.LMC%d: %s: phys_addr=0x%lx/0x%lx (0x%lx)\n",
> +	      node, if_num, __func__, p, p + p2offset, 1ULL << bitno);
> +
> +	/*
> +	 * The loop ranges and increments walk through a range of addresses
> +	 * avoiding bits that alias to different memory interfaces (LMCs)
> +	 * on the CN88XX; ie we want to limit activity to a single memory
> +	 * channel.
> +	 */
> +
> +	/* Store something into each location first */
> +	// NOTE: the ordering of loops is purposeful
> +	for (i = 0; i < (1 << 7); i += 8) {
> +		for (k = 0; k < (1 << bitno); k += (1 << 14)) {
> +			for (j = 0; j < (1 << 12); j += (1 << 9)) {
> +				index = i + j + k;
> +				p1 = p + index;
> +				p2 = p1 + p2offset;
> +
> +#if DO_LIKE_RANDOM_XOR
> +				v = pattern1 * p1;
> +				v1 = v;	// write the same thing to both areas
> +#else
> +				v = 0ULL;
> +				v1 = v;
> +#endif
> +				cvmx_write64_uint64(p1, v);
> +				cvmx_write64_uint64(p2, v1);
> +
> +				/* Write back and invalidate the cache lines
> +				 *
> +				 * For OCX we cannot limit the number of L2 ways
> +				 * so instead we just write back and invalidate
> +				 * the L2 cache lines.  This is not possible
> +				 * when booting remotely, however so this is
> +				 * only enabled for U-Boot right now.
> +				 * Potentially the BDK can also take advantage
> +				 * of this.
> +				 */
> +				CVMX_CACHE_WBIL2(p1, 0);
> +				CVMX_CACHE_WBIL2(p2, 0);
> +			}
> +		}
> +	}
> +	CVMX_DCACHE_INVALIDATE;

can't you use the generic cache API now, since you already have a
customized implementation?

> +
> +#if DO_LIKE_RANDOM_XOR
> +	this_pattern = cvmx_rng_get_random64();
> +#endif
> +
> +	// modify the contents of each location in some way
> +	// NOTE: the ordering of loops is purposeful
> +	for (i = 0; i < (1 << 7); i += 8) {
> +		for (k = 0; k < (1 << bitno); k += (1 << 14)) {
> +			for (j = 0; j < (1 << 12); j += (1 << 9)) {
> +				index = i + j + k;
> +				p1 = p + index;
> +				p2 = p1 + p2offset;
> +#if DO_LIKE_RANDOM_XOR
> +				v = cvmx_read64_uint64(p1) ^ this_pattern;
> +				v1 = cvmx_read64_uint64(p2) ^ this_pattern;
> +#else
> +				v = test_pattern[index %
> +						 (sizeof(test_pattern) /
> +						  sizeof(u64))];
> +				v &= datamask;
> +				v1 = ~v;
> +#endif
> +
> +				cvmx_write64_uint64(p1, v);
> +				cvmx_write64_uint64(p2, v1);
> +
> +				/*
> +				 * Write back and invalidate the cache lines
> +				 *
> +				 * For OCX we cannot limit the number of L2 ways
> +				 * so instead we just write back and invalidate
> +				 * the L2 cache lines.  This is not possible
> +				 * when booting remotely, however so this is
> +				 * only enabled for U-Boot right now.
> +				 * Potentially the BDK can also take advantage
> +				 * of this.
> +				 */
> +				CVMX_CACHE_WBIL2(p1, 0);
> +				CVMX_CACHE_WBIL2(p2, 0);
> +			}
> +		}
> +	}
> +	CVMX_DCACHE_INVALIDATE;
> +
> +	// test the contents of each location by predicting what should be there
> +	// NOTE: the ordering of loops is purposeful: test full cachelines to
> +	//       detect an error occurring in any slot thereof
> +	for (k = 0; k < (1 << bitno); k += (1 << 14)) {
> +		for (j = 0; j < (1 << 12); j += (1 << 9)) {
> +			for (i = 0; i < (1 << 7); i += 8) {
> +				int bybit = 1;
> +				u64 bymsk = 0xffULL;	// start in byte lane 0
> +
> +				index = i + j + k;
> +				p1 = p + index;
> +				p2 = p1 + p2offset;
> +#if DO_LIKE_RANDOM_XOR
> +				// FIXME: this should predict what we find...???
> +				v = (p1 * pattern1) ^ this_pattern;
> +				d1 = cvmx_read64_uint64(p1);
> +				d2 = cvmx_read64_uint64(p2);
> +#else
> +				v = test_pattern[index %
> +						 (sizeof(test_pattern) /
> +						  sizeof(u64))];
> +				d1 = cvmx_read64_uint64(p1);
> +				d2 = ~cvmx_read64_uint64(p2);
> +#endif
> +
> +				// union of error bits only in active byte lanes
> +				xor = ((d1 ^ v) | (d2 ^ v)) & datamask;
> +
> +				if (!xor)
> +					continue;
> +
> +				// accumulate bad bits
> +				bad_bits[0] |= xor;
> +
> +				while (xor != 0) {
> +					debug("ERROR: [0x%016llX] [0x%016llX]  expected 0x%016llX xor %016llX\n",
> +					      p1, p2, v, xor);
> +					// error(s) in this lane
> +					if (xor & bymsk) {
> +						// set the byte error bit
> +						errors |= bybit;
> +						// clear byte lane in error bits
> +						xor &= ~bymsk;
> +						// clear the byte lane in the
> +						// mask
> +						datamask &= ~bymsk;
> +						// nothing left to do
> +						if (datamask == 0) {
> +							// completely done when
> +							// errors found in all
> +							// byte lanes in
> +							// datamask
> +							goto done_now;
> +						}
> +					}
> +					// move mask into next byte lane
> +					bymsk <<= 8;
> +					// move bit into next byte position
> +					bybit <<= 1;
> +				}
> +			}
> +		}
> +	}
> +
> +done_now:
> +	if (xor_data) {		// send the bad bits back...
> +		xor_data[0] = bad_bits[0];
> +		xor_data[1] = bad_bits[1];	// let it be zeroed
> +	}
> +	return errors;
> +}
> +
> +#else /* USE_ORIG_TEST_DRAM_BYTE */
> +
> +#define bdk_numa_get_address(n, p)	((p) | ((u64)(n) << CVMX_NODE_MEM_SHIFT))
> +#define AREA_BASE_OFFSET		BIT_ULL(26)
> +
> +/*
> + * Write/XOR/verify memory test over two areas that are 1 << 26 bytes
> + * apart, reporting which byte lanes showed errors.
> + *
> + * @priv:	driver private data passed to the L2C CSR accessors
> + * @lmc:	LMC number, placed into the address at bit 7
> + * @p:		base address; AREA_BASE_OFFSET is added before use
> + * @bitmask:	data bits to check; byte lanes may be clear in the mask to
> + *		indicate no testing on that lane
> + * @xor_data:	if not NULL, xor_data[0] receives the union of all bad
> + *		bits seen and xor_data[1] is set to 0
> + *
> + * Both areas are first filled with pattern1 * address, then XORed in
> + * place with a second random value, then read back and compared against
> + * the predicted value. L2C_CTL[DISSBLKDTY] is set to 1 for the duration
> + * of the test and restored on exit.
> + *
> + * Return: bitmask with bit N set when byte lane N had an error.
> + */
> +static int test_dram_byte64(struct ddr_priv *priv, int lmc, u64 p,
> +			    u64 bitmask, u64 *xor_data)
> +{
> +	u64 p1, p2, d1, d2;
> +	u64 v, v1;
> +	u64 p2offset = (1ULL << 26);	/* offset to area 2 */
> +	u64 datamask;
> +	u64 xor;
> +	u64 i, j, k;
> +	u64 ii;
> +	int errors = 0;
> +	u64 pattern1 = cvmx_rng_get_random64();
> +	u64 pattern2 = 0;
> +	u64 bad_bits[2] = { 0, 0 };
> +	int kbitno = (octeon_is_cpuid(OCTEON_CN7XXX)) ? 20 : 18;
> +	union cvmx_l2c_ctl l2c_ctl;
> +	int burst;
> +	int saved_dissblkdty;
> +	int node = 0;
> +
> +	/* Force full cacheline write-backs to boost traffic */
> +	l2c_ctl.u64 = l2c_rd(priv, CVMX_L2C_CTL);
> +	saved_dissblkdty = l2c_ctl.cn78xx.dissblkdty;
> +	l2c_ctl.cn78xx.dissblkdty = 1;
> +	l2c_wr(priv, CVMX_L2C_CTL, l2c_ctl.u64);
> +
> +	if (octeon_is_cpuid(OCTEON_CN73XX) || octeon_is_cpuid(OCTEON_CNF75XX))
> +		kbitno = 18;
> +
> +	/*
> +	 * Byte lanes may be clear in the mask to indicate no testing on that
> +	 * lane.
> +	 */
> +	datamask = bitmask;
> +
> +	/*
> +	 * Add offset to both test regions to not clobber boot stuff
> +	 * when running from L2 for NAND boot.
> +	 */
> +	p += AREA_BASE_OFFSET;	/* make sure base is out of the way of boot */
> +
> +	/* final address must include LMC and node */
> +	p |= (lmc << 7);	/* Map address into proper interface */
> +	p = bdk_numa_get_address(node, p);	/* Map to node */
> +	p |= 1ull << 63;
> +
> +#define II_INC BIT_ULL(22)
> +#define II_MAX BIT_ULL(22)
> +#define K_INC  BIT_ULL(14)
> +#define K_MAX  BIT_ULL(kbitno)
> +#define J_INC  BIT_ULL(9)
> +#define J_MAX  BIT_ULL(12)
> +#define I_INC  BIT_ULL(3)
> +#define I_MAX  BIT_ULL(7)
> +
> +	debug("N%d.LMC%d: %s: phys_addr=0x%llx/0x%llx (0x%llx)\n",
> +	      node, lmc, __func__, p, p + p2offset, 1ULL << kbitno);
> +
> +	/*
> +	 * Loops are ordered so that only a single 64-bit slot is written to
> +	 * each cacheline at one time, then the cachelines are forced out;
> +	 * this should maximize read/write traffic.
> +	 */
> +
> +	/* FIXME? extend the range of memory tested!! */
> +	for (ii = 0; ii < II_MAX; ii += II_INC) {
> +		for (i = 0; i < I_MAX; i += I_INC) {
> +			for (k = 0; k < K_MAX; k += K_INC) {
> +				for (j = 0; j < J_MAX; j += J_INC) {
> +					p1 = p + ii + k + j;
> +					p2 = p1 + p2offset;
> +
> +					v = pattern1 * (p1 + i);
> +					/* write the same thing to both areas */
> +					v1 = v;
> +
> +					cvmx_write64_uint64(p1 + i, v);
> +					cvmx_write64_uint64(p2 + i, v1);
> +
> +					CVMX_CACHE_WBIL2(p1, 0);
> +					CVMX_CACHE_WBIL2(p2, 0);
> +				}
> +			}
> +		}
> +	}
> +
> +	CVMX_DCACHE_INVALIDATE;
> +
> +	debug("N%d.LMC%d: dram_tuning_mem_xor: done INIT loop\n", node, lmc);
> +
> +	/* Make a series of passes over the memory areas. */
> +
> +	for (burst = 0; burst < 1 /* was: dram_tune_use_bursts */ ; burst++) {
> +		u64 this_pattern = cvmx_rng_get_random64();
> +
> +		/* pattern2 accumulates every XOR applied so far */
> +		pattern2 ^= this_pattern;
> +
> +		/*
> +		 * XOR the data with a random value, applying the change to both
> +		 * memory areas.
> +		 */
> +
> +		/* FIXME? extend the range of memory tested!! */
> +		for (ii = 0; ii < II_MAX; ii += II_INC) {
> +			/* FIXME: rearranged, did not make much difference? */
> +			for (i = 0; i < I_MAX; i += I_INC) {
> +				for (k = 0; k < K_MAX; k += K_INC) {
> +					for (j = 0; j < J_MAX; j += J_INC) {
> +						p1 = p + ii + k + j;
> +						p2 = p1 + p2offset;
> +
> +						v = cvmx_read64_uint64(p1 +
> +								      i) ^
> +						    this_pattern;
> +						v1 = cvmx_read64_uint64(p2 +
> +								       i) ^
> +						    this_pattern;
> +
> +						cvmx_write64_uint64(p1 + i, v);
> +						cvmx_write64_uint64(p2 + i, v1);
> +
> +						CVMX_CACHE_WBIL2(p1, 0);
> +						CVMX_CACHE_WBIL2(p2, 0);
> +					}
> +				}
> +			}
> +		}
> +
> +		CVMX_DCACHE_INVALIDATE;
> +
> +		debug("N%d.LMC%d: dram_tuning_mem_xor: done MODIFY loop\n",
> +		      node, lmc);
> +
> +		/*
> +		 * Look for differences in the areas. If there is a mismatch,
> +		 * reset both memory locations with the same pattern. Failing
> +		 * to do so means that on all subsequent passes the pair of
> +		 * locations remain out of sync giving spurious errors.
> +		 */
> +
> +		/*
> +		 * FIXME: Change the loop order so that an entire cache line
> +		 *        is compared at one time. This is so that a read
> +		 *        error that occurs *anywhere* on the cacheline will
> +		 *        be caught, rather than comparing only 1 cacheline
> +		 *        slot at a time, where an error on a different
> +		 *        slot will be missed that time around
> +		 * Does the above make sense?
> +		 */
> +
> +		/* FIXME? extend the range of memory tested!! */
> +		for (ii = 0; ii < II_MAX; ii += II_INC) {
> +			for (k = 0; k < K_MAX; k += K_INC) {
> +				for (j = 0; j < J_MAX; j += J_INC) {
> +					p1 = p + ii + k + j;
> +					p2 = p1 + p2offset;
> +
> +					/*
> +					 * process entire cachelines in the
> +					 * innermost loop
> +					 */
> +					for (i = 0; i < I_MAX; i += I_INC) {
> +						int bybit = 1;
> +						/* start in byte lane 0 */
> +						u64 bymsk = 0xffULL;
> +
> +						/*
> +						 * FIXME: this should predict
> +						 * what we find...???
> +						 */
> +						v = ((p1 + i) * pattern1) ^
> +							pattern2;
> +						d1 = cvmx_read64_uint64(p1 + i);
> +						d2 = cvmx_read64_uint64(p2 + i);
> +
> +						/*
> +						 * union of error bits only in
> +						 * active byte lanes
> +						 */
> +						xor = ((d1 ^ v) | (d2 ^ v)) &
> +							datamask;
> +
> +						if (!xor)
> +							continue;
> +
> +						/* accumulate bad bits */
> +						bad_bits[0] |= xor;
> +
> +						/*
> +						 * Report each failing word
> +						 * once, not once per byte-lane
> +						 * step of the loop below.
> +						 */
> +						debug("ERROR(%03d): [0x%016llX] [0x%016llX]  expected 0x%016llX d1 %016llX d2 %016llX\n",
> +						      burst, p1, p2, v,
> +						      d1, d2);
> +
> +						while (xor != 0) {
> +							/* error(s) in this lane */
> +							if (xor & bymsk) {
> +								/* set the byte error bit */
> +								errors |= bybit;
> +								/* clear byte lane in error bits */
> +								xor &= ~bymsk;
> +								/* clear the byte lane in the mask */
> +								datamask &= ~bymsk;
> +#if EXIT_WHEN_ALL_LANES_HAVE_ERRORS
> +								/*
> +								 * Nothing left to do. Leave
> +								 * through the common exit so
> +								 * xor_data is filled in and
> +								 * L2C_CTL is restored.
> +								 */
> +								if (datamask == 0)
> +									goto done;
> +#endif /* EXIT_WHEN_ALL_LANES_HAVE_ERRORS */
> +							}
> +							/* move mask into next byte lane */
> +							bymsk <<= 8;
> +							/* move bit into next byte position */
> +							bybit <<= 1;
> +						}
> +					}
> +					CVMX_CACHE_WBIL2(p1, 0);
> +					CVMX_CACHE_WBIL2(p2, 0);
> +				}
> +			}
> +		}
> +
> +		debug("N%d.LMC%d: dram_tuning_mem_xor: done TEST loop\n",
> +		      node, lmc);
> +	}
> +
> +#if EXIT_WHEN_ALL_LANES_HAVE_ERRORS
> +done:
> +#endif /* EXIT_WHEN_ALL_LANES_HAVE_ERRORS */
> +	if (xor_data) {		/* send the bad bits back... */
> +		xor_data[0] = bad_bits[0];
> +		xor_data[1] = bad_bits[1];	/* let it be zeroed */
> +	}
> +
> +	/* Restore original setting that could enable partial cacheline writes */
> +	l2c_ctl.u64 = l2c_rd(priv, CVMX_L2C_CTL);
> +	l2c_ctl.cn78xx.dissblkdty = saved_dissblkdty;
> +	l2c_wr(priv, CVMX_L2C_CTL, l2c_ctl.u64);
> +
> +	return errors;
> +}
> +#endif /* USE_ORIG_TEST_DRAM_BYTE */
> +
> +/*
> + * Program LMC(if_num)_MR_MPR_CTL for a mode register write to one rank,
> + * then run LMC sequence 0x8 (Mode Register Write) on that rank.
> + *
> + * @priv:	driver private data passed to the LMC CSR accessors
> + * @if_num:	LMC interface number
> + * @rank:	rank to address; also selects the single-bit rank mask
> + * @mr_wr_addr:	value for MR_WR_ADDR, or -1 to set MR_WR_USE_DEFAULT_VALUE
> + *		(MR_WR_ADDR is then written as 0)
> + * @mr_wr_sel:	value for MR_WR_SEL
> + * @mr_wr_bg1:	value for MR_WR_BG1
> + */
> +static void ddr4_mrw(struct ddr_priv *priv, int if_num, int rank,
> +		     int mr_wr_addr, int mr_wr_sel, int mr_wr_bg1)
> +{
> +	union cvmx_lmcx_mr_mpr_ctl lmc_mr_mpr_ctl;
> +	int use_default = (mr_wr_addr == -1);
> +
> +	lmc_mr_mpr_ctl.u64 = 0;
> +	lmc_mr_mpr_ctl.cn78xx.mr_wr_use_default_value = use_default ? 1 : 0;
> +	lmc_mr_mpr_ctl.cn78xx.mr_wr_addr = use_default ? 0 : mr_wr_addr;
> +	lmc_mr_mpr_ctl.cn78xx.mr_wr_sel = mr_wr_sel;
> +	lmc_mr_mpr_ctl.cn78xx.mr_wr_rank = rank;
> +	lmc_mr_mpr_ctl.cn78xx.mr_wr_bg1 = mr_wr_bg1;
> +	lmc_wr(priv, CVMX_LMCX_MR_MPR_CTL(if_num), lmc_mr_mpr_ctl.u64);
> +
> +	/* Mode Register Write */
> +	oct3_ddr3_seq(priv, 1 << rank, if_num, 0x8);
> +}
> +
> +#define INV_A0_17(x)	((x) ^ 0x22bf8)
> +
> +/*
> + * Issue an MR3 write carrying (mpr << 2) to every rank selected by
> + * @rank_mask, looking at ranks 0 .. dimm_count * 4 - 1.
> + *
> + * With bg1 == 0 (A-side) the address and the MR select value 3 are used
> + * as they are. Otherwise (B-side) the address goes through INV_A0_17()
> + * and the MR select value is passed as ~3.
> + */
> +static void set_mpr_mode(struct ddr_priv *priv, int rank_mask,
> +			 int if_num, int dimm_count, int mpr, int bg1)
> +{
> +	int num_ranks = dimm_count * 4;
> +	int mr3_addr = mpr << 2;
> +	int mr_sel = 3;
> +	int rankx;
> +
> +	debug("All Ranks: Set mpr mode = %x %c-side\n",
> +	      mpr, (bg1 == 0) ? 'A' : 'B');
> +
> +	if (bg1 != 0) {
> +		/* MR3 B-side */
> +		mr3_addr = INV_A0_17(mr3_addr);
> +		mr_sel = ~3;
> +	}
> +
> +	for (rankx = 0; rankx < num_ranks; rankx++) {
> +		if (rank_mask & (1 << rankx))
> +			ddr4_mrw(priv, if_num, rankx, mr3_addr, mr_sel, bg1);
> +	}
> +}
> +
> +/*
> + * Set up LMC(if_num)_MR_MPR_CTL for an MPR read of @location in @page on
> + * @rank, then run LMC sequence 0x9 (MPR register access).
> + *
> + * The register is read first, so fields not assigned here keep their
> + * current values. The read data is not fetched by this function.
> + */
> +static void do_ddr4_mpr_read(struct ddr_priv *priv, int if_num,
> +			     int rank, int page, int location)
> +{
> +	union cvmx_lmcx_mr_mpr_ctl mpr_ctl;
> +
> +	mpr_ctl.u64 = lmc_rd(priv, CVMX_LMCX_MR_MPR_CTL(if_num));
> +	mpr_ctl.cn70xx.mpr_wr = 0;	/* Read=0, Write=1 */
> +	mpr_ctl.cn70xx.mpr_loc = location;
> +	mpr_ctl.cn70xx.mr_wr_rank = rank;
> +	mpr_ctl.cn70xx.mr_wr_sel = page;	/* Page */
> +	mpr_ctl.cn70xx.mr_wr_addr = 0;
> +	lmc_wr(priv, CVMX_LMCX_MR_MPR_CTL(if_num), mpr_ctl.u64);
> +
> +	/* MPR register access sequence */
> +	oct3_ddr3_seq(priv, 1 << rank, if_num, 0x9);
> +
> +	/* Dump the value that was written, taken from the local copy */
> +	debug("LMC_MR_MPR_CTL                  : 0x%016llx\n",
> +	      mpr_ctl.u64);
> +	debug("lmc_mr_mpr_ctl.cn70xx.mr_wr_addr: 0x%02x\n",
> +	      mpr_ctl.cn70xx.mr_wr_addr);
> +	debug("lmc_mr_mpr_ctl.cn70xx.mr_wr_sel : 0x%02x\n",
> +	      mpr_ctl.cn70xx.mr_wr_sel);
> +	debug("lmc_mr_mpr_ctl.cn70xx.mpr_loc   : 0x%02x\n",
> +	      mpr_ctl.cn70xx.mpr_loc);
> +	debug("lmc_mr_mpr_ctl.cn70xx.mpr_wr    : 0x%02x\n",
> +	      mpr_ctl.cn70xx.mpr_wr);
> +}
> +
> +/*
> + * Write @enable to LMC(if_num)_CONTROL[RDIMM_ENA].
> + *
> + * Return: the RDIMM_ENA value that was in the register before the write.
> + */
> +static int set_rdimm_mode(struct ddr_priv *priv, int if_num, int enable)
> +{
> +	union cvmx_lmcx_control ctl;
> +	int previous;
> +
> +	ctl.u64 = lmc_rd(priv, CVMX_LMCX_CONTROL(if_num));
> +	previous = ctl.s.rdimm_ena;
> +
> +	ctl.s.rdimm_ena = enable;
> +	debug("Setting RDIMM_ENA = %x\n", enable);
> +	lmc_wr(priv, CVMX_LMCX_CONTROL(if_num), ctl.u64);
> +
> +	return previous;
> +}
> +
> +/*
> + * Perform an MPR read and store the contents of
> + * LMC(if_num)_MPR_DATA0 in mpr_data[0]. No other element of @mpr_data
> + * is written.
> + */
> +static void ddr4_mpr_read(struct ddr_priv *priv, int if_num, int rank,
> +			  int page, int location, u64 *mpr_data)
> +{
> +	u64 data0;
> +
> +	do_ddr4_mpr_read(priv, if_num, rank, page, location);
> +
> +	data0 = lmc_rd(priv, CVMX_LMCX_MPR_DATA0(if_num));
> +	mpr_data[0] = data0;
> +}
> +
> +/*
> + * Display MPR values for Page
> + *
> + * For each of ranks 0-3 selected in @rank_mask, read locations 0-3 of
> + * @page and print the low byte of each through debug().
> + */
> +static void display_mpr_page(struct ddr_priv *priv, int rank_mask,
> +			     int if_num, int page)
> +{
> +	u64 mpr_data[3];
> +	int rankx;
> +	int loc;
> +
> +	for (rankx = 0; rankx < 4; rankx++) {
> +		if ((rank_mask & (1 << rankx)) == 0)
> +			continue;
> +
> +		debug("N0.LMC%d.R%d: MPR Page %d loc [0:3]: ",
> +		      if_num, rankx, page);
> +
> +		loc = 0;
> +		while (loc < 4) {
> +			ddr4_mpr_read(priv, if_num, rankx, page, loc,
> +				      mpr_data);
> +			debug("0x%02llx ", mpr_data[0] & 0xFF);
> +			loc++;
> +		}
> +
> +		debug("\n");
> +	}
> +}
> +
> +/*
> + * Write the byte @mpr_data to @location of MPR @page on @rank.
> + *
> + * LMC(if_num)_MR_MPR_CTL is built from zero, so every field not assigned
> + * here is written as 0. LMC sequence 0x9 (MPR register access) is then
> + * run on the rank.
> + */
> +static void ddr4_mpr_write(struct ddr_priv *priv, int if_num, int rank,
> +			   int page, int location, u8 mpr_data)
> +{
> +	union cvmx_lmcx_mr_mpr_ctl mpr_ctl;
> +
> +	mpr_ctl.u64 = 0;
> +	mpr_ctl.cn70xx.mpr_wr = 1;	/* Read=0, Write=1 */
> +	mpr_ctl.cn70xx.mpr_loc = location;
> +	mpr_ctl.cn70xx.mr_wr_rank = rank;
> +	mpr_ctl.cn70xx.mr_wr_sel = page;	/* Page */
> +	mpr_ctl.cn70xx.mr_wr_addr = mpr_data;
> +	lmc_wr(priv, CVMX_LMCX_MR_MPR_CTL(if_num), mpr_ctl.u64);
> +
> +	/* MPR register access sequence */
> +	oct3_ddr3_seq(priv, 1 << rank, if_num, 0x9);
> +
> +	/* Dump the value that was written, taken from the local copy */
> +	debug("LMC_MR_MPR_CTL                  : 0x%016llx\n",
> +	      mpr_ctl.u64);
> +	debug("lmc_mr_mpr_ctl.cn70xx.mr_wr_addr: 0x%02x\n",
> +	      mpr_ctl.cn70xx.mr_wr_addr);
> +	debug("lmc_mr_mpr_ctl.cn70xx.mr_wr_sel : 0x%02x\n",
> +	      mpr_ctl.cn70xx.mr_wr_sel);
> +	debug("lmc_mr_mpr_ctl.cn70xx.mpr_loc   : 0x%02x\n",
> +	      mpr_ctl.cn70xx.mpr_loc);
> +	debug("lmc_mr_mpr_ctl.cn70xx.mpr_wr    : 0x%02x\n",
> +	      mpr_ctl.cn70xx.mpr_wr);
> +}
> +
> +/*
> + * Write MR6 on @rank with vrefDQ training enabled, using the given
> + * training @range and @value.
> + *
> + * The MR6 address is assembled as:
> + *   A12:A10  tCCD_L, taken from LMC(if_num)_MODEREG_PARAMS3
> + *   A7       1 = Enable (Training Mode)
> + *   A6       vrefDQ Training Range
> + *   A5:A0    vrefDQ Training Value
> + *
> + * NOTE(review): after the two Mode Register Write sequences, MR_MPR_CTL
> + * is rewritten with A7 cleared, but no further sequence is started in
> + * this function.
> + */
> +static void set_vref(struct ddr_priv *priv, int if_num, int rank,
> +		     int range, int value)
> +{
> +	union cvmx_lmcx_modereg_params3 params3;
> +	union cvmx_lmcx_mr_mpr_ctl mpr_ctl;
> +	int mr6_addr;
> +
> +	mpr_ctl.u64 = 0;
> +	params3.u64 = lmc_rd(priv, CVMX_LMCX_MODEREG_PARAMS3(if_num));
> +
> +	mr6_addr = (params3.s.tccd_l << 10) | (1 << 7) | (range << 6) |
> +		   (value << 0);
> +
> +	mpr_ctl.cn70xx.mr_wr_addr = mr6_addr;
> +	mpr_ctl.cn70xx.mr_wr_sel = 6;	/* Write MR6 */
> +	mpr_ctl.cn70xx.mr_wr_rank = rank;
> +	lmc_wr(priv, CVMX_LMCX_MR_MPR_CTL(if_num), mpr_ctl.u64);
> +
> +	/* 0x8 = Mode Register Write */
> +	oct3_ddr3_seq(priv, 1 << rank, if_num, 0x8);
> +
> +	/*
> +	 * It is vendor specific whether vref_value is captured with A7=1.
> +	 * A subsequent MRS might be necessary.
> +	 */
> +	oct3_ddr3_seq(priv, 1 << rank, if_num, 0x8);
> +
> +	/* A7 0 = Disable(Training Mode) */
> +	mr6_addr &= ~(1 << 7);
> +	mpr_ctl.cn70xx.mr_wr_addr = mr6_addr;
> +	lmc_wr(priv, CVMX_LMCX_MR_MPR_CTL(if_num), mpr_ctl.u64);
> +}
> +
> +/*
> + * Set bit 0 of register control word RC0 on every DIMM to the low bit of
> + * @inversion, then run LMC sequence 0x7 (Init RCW) on @rank_mask.
> + *
> + * LMC(if_num)_DDR4_DIMM_CTL is written as 0 first. DIMM0_WMASK is set to
> + * 1; DIMM1_WMASK is set to 1 only when @dimm_count is greater than 1.
> + */
> +static void set_dram_output_inversion(struct ddr_priv *priv, int if_num,
> +				      int dimm_count, int rank_mask,
> +				      int inversion)
> +{
> +	union cvmx_lmcx_ddr4_dimm_ctl ddr4_dimm_ctl;
> +	union cvmx_lmcx_dimmx_params dimm_params;
> +	union cvmx_lmcx_dimm_ctl dimm_ctl;
> +	int rc0_bit0 = inversion & 1;
> +	int dimm_no;
> +
> +	/* Don't touch extended register control words */
> +	ddr4_dimm_ctl.u64 = 0;
> +	lmc_wr(priv, CVMX_LMCX_DDR4_DIMM_CTL(if_num), ddr4_dimm_ctl.u64);
> +
> +	debug("All DIMMs: Register Control Word          RC0 : %x\n",
> +	      (inversion & 1));
> +
> +	for (dimm_no = 0; dimm_no < dimm_count; ++dimm_no) {
> +		dimm_params.u64 =
> +		    lmc_rd(priv, CVMX_LMCX_DIMMX_PARAMS(dimm_no, if_num));
> +
> +		/* Replace bit 0 of RC0, keep its other bits */
> +		dimm_params.s.rc0 = (dimm_params.s.rc0 & ~1) | rc0_bit0;
> +
> +		lmc_wr(priv, CVMX_LMCX_DIMMX_PARAMS(dimm_no, if_num),
> +		       dimm_params.u64);
> +	}
> +
> +	/* LMC0_DIMM_CTL */
> +	dimm_ctl.u64 = lmc_rd(priv, CVMX_LMCX_DIMM_CTL(if_num));
> +	dimm_ctl.s.dimm0_wmask = 0x1;
> +	if (dimm_count > 1)
> +		dimm_ctl.s.dimm1_wmask = 0x0001;
> +	else
> +		dimm_ctl.s.dimm1_wmask = 0x0000;
> +
> +	debug("LMC DIMM_CTL                                  : 0x%016llx\n",
> +	      dimm_ctl.u64);
> +	lmc_wr(priv, CVMX_LMCX_DIMM_CTL(if_num), dimm_ctl.u64);
> +
> +	oct3_ddr3_seq(priv, rank_mask, if_num, 0x7);	/* Init RCW */
> +}
> +
> +/*
> + * Write @pattern into MPR page 0 of every rank selected by @rank_mask.
> + *
> + * Ranks 0 .. (dimm_count * 4 - 1) are considered. For each selected rank,
> + * the locations 0..3 whose bit is set in @location_mask are written via
> + * ddr4_mpr_write().
> + */
> +static void write_mpr_page0_pattern(struct ddr_priv *priv, int rank_mask,
> +				    int if_num, int dimm_count, int pattern,
> +				    int location_mask)
> +{
> +	const int rank_limit = dimm_count * 4;
> +	int rank, loc;
> +
> +	for (rank = 0; rank < rank_limit; rank++) {
> +		if ((rank_mask & (1 << rank)) == 0)
> +			continue;
> +
> +		for (loc = 0; loc < 4; loc++) {
> +			if (location_mask & (1 << loc))
> +				ddr4_mpr_write(priv, if_num, rank,
> +					       0 /* page */, loc, pattern);
> +		}
> +	}
> +}
> +
> +/*
> + * Rewrite the MPR page 0 training pattern of an RDIMM.
> + *
> + * Refresh is switched off (REF_ZQCS_INT = 0) for the duration of the
> + * sequence and restored to its previous value at the end. In between,
> + * the devices are put into MPR mode, RCD output inversion and RDIMM mode
> + * are temporarily disabled, the pattern 0x55 is written, and RDIMM mode
> + * and inversion are enabled again.
> + *
> + * @priv:	driver private data, handed to the LMC accessors
> + * @rank_mask:	ranks to operate on
> + * @if_num:	LMC interface number
> + * @dimm_count:	number of DIMMs on the interface
> + */
> +static void change_rdimm_mpr_pattern(struct ddr_priv *priv, int rank_mask,
> +				     int if_num, int dimm_count)
> +{
> +	int save_ref_zqcs_int;
> +	union cvmx_lmcx_config lmc_config;
> +
> +	/*
> +	 * Okay, here is the latest sequence.  This should work for all
> +	 * chips and passes (78,88,73,etc).  This sequence should be run
> +	 * immediately after DRAM INIT.  The basic idea is to write the
> +	 * same pattern into each of the 4 MPR locations in the DRAM, so
> +	 * that the same value is returned when doing MPR reads regardless
> +	 * of the inversion state.  My advice is to put this into a
> +	 * function, change_rdimm_mpr_pattern or something like that, so
> +	 * that it can be called multiple times, as I think David wants a
> +	 * clock-like pattern for OFFSET training, but does not want a
> +	 * clock pattern for Bit-Deskew.  You should then be able to call
> +	 * this at any point in the init sequence (after DRAM init) to
> +	 * change the pattern to a new value.
> +	 * Mike
> +	 *
> +	 * A correction: PHY doesn't need any pattern during offset
> +	 * training, but needs clock like pattern for internal vref and
> +	 * bit-dskew training.  So for that reason, these steps below have
> +	 * to be conducted before those trainings to pre-condition
> +	 * the pattern.  David
> +	 *
> +	 * Note: Step 3, 4, 8 and 9 have to be done through RDIMM
> +	 * sequence. If you issue MRW sequence to do RCW write (in o78 pass
> +	 * 1 at least), LMC will still do two commands because
> +	 * CONTROL[RDIMM_ENA] is still set high. We don't want it to have
> +	 * any unintentional mode register write so it's best to do what
> +	 * Mike is doing here.
> +	 * Andrew
> +	 */
> +
> +	/* 1) Disable refresh (REF_ZQCS_INT = 0) */
> +
> +	debug("1) Disable refresh (REF_ZQCS_INT = 0)\n");
> +
> +	lmc_config.u64 = lmc_rd(priv, CVMX_LMCX_CONFIG(if_num));
> +	save_ref_zqcs_int = lmc_config.cn78xx.ref_zqcs_int;
> +	lmc_config.cn78xx.ref_zqcs_int = 0;
> +	lmc_wr(priv, CVMX_LMCX_CONFIG(if_num), lmc_config.u64);
> +
> +	/*
> +	 * 2) Put all devices in MPR mode (Run MRW sequence (sequence=8)
> +	 * with MODEREG_PARAMS0[MPRLOC]=0,
> +	 * MODEREG_PARAMS0[MPR]=1, MR_MPR_CTL[MR_WR_SEL]=3, and
> +	 * MR_MPR_CTL[MR_WR_USE_DEFAULT_VALUE]=1)
> +	 */
> +
> +	debug("2) Put all devices in MPR mode (Run MRW sequence (sequence=8)\n");
> +
> +	/* A-side */
> +	set_mpr_mode(priv, rank_mask, if_num, dimm_count, 1, 0);
> +	/* B-side */
> +	set_mpr_mode(priv, rank_mask, if_num, dimm_count, 1, 1);
> +
> +	/*
> +	 * a. Or you can set MR_MPR_CTL[MR_WR_USE_DEFAULT_VALUE]=0 and set
> +	 * the value you would like directly into
> +	 * MR_MPR_CTL[MR_WR_ADDR]
> +	 */
> +
> +	/*
> +	 * 3) Disable RCD Parity (if previously enabled) - parity does not
> +	 * work if inversion disabled
> +	 *
> +	 * Nothing is programmed for this step here; only the debug message
> +	 * is emitted.
> +	 */
> +
> +	debug("3) Disable RCD Parity\n");
> +
> +	/*
> +	 * 4) Disable Inversion in the RCD.
> +	 * a. I did (3&4) via the RDIMM sequence (seq_sel=7), but it
> +	 * may be easier to use the MRW sequence (seq_sel=8).  Just set
> +	 * MR_MPR_CTL[MR_WR_SEL]=7, MR_MPR_CTL[MR_WR_ADDR][3:0]=data,
> +	 * MR_MPR_CTL[MR_WR_ADDR][7:4]=RCD reg
> +	 */
> +
> +	debug("4) Disable Inversion in the RCD.\n");
> +
> +	set_dram_output_inversion(priv, if_num, dimm_count, rank_mask, 1);
> +
> +	/*
> +	 * 5) Disable CONTROL[RDIMM_ENA] so that MR sequence goes out
> +	 * non-inverted.
> +	 */
> +
> +	debug("5) Disable CONTROL[RDIMM_ENA]\n");
> +
> +	set_rdimm_mode(priv, if_num, 0);
> +
> +	/*
> +	 * 6) Write all 4 MPR registers with the desired pattern (have to
> +	 * do this for all enabled ranks)
> +	 * a. MR_MPR_CTL.MPR_WR=1, MR_MPR_CTL.MPR_LOC=0..3,
> +	 * MR_MPR_CTL.MR_WR_SEL=0, MR_MPR_CTL.MR_WR_ADDR[7:0]=pattern
> +	 *
> +	 * NOTE(review): the location mask 0x8 passed below makes
> +	 * write_mpr_page0_pattern() write location 3 only, although the
> +	 * description above talks about all 4 MPR registers - confirm that
> +	 * this is intended.
> +	 */
> +
> +	debug("6) Write all 4 MPR page 0 Training Patterns\n");
> +
> +	write_mpr_page0_pattern(priv, rank_mask, if_num, dimm_count, 0x55, 0x8);
> +
> +	/* 7) Re-enable RDIMM_ENA */
> +
> +	debug("7) Re-enable RDIMM_ENA\n");
> +
> +	set_rdimm_mode(priv, if_num, 1);
> +
> +	/* 8) Re-enable RDIMM inversion */
> +
> +	debug("8) Re-enable RDIMM inversion\n");
> +
> +	set_dram_output_inversion(priv, if_num, dimm_count, rank_mask, 0);
> +
> +	/*
> +	 * 9) Re-enable RDIMM parity (if desired)
> +	 *
> +	 * As with step 3, only the debug message is emitted here.
> +	 */
> +
> +	debug("9) Re-enable RDIMM parity (if desired)\n");
> +
> +	/*
> +	 * 10)Take B-side devices out of MPR mode (Run MRW sequence
> +	 * (sequence=8) with MODEREG_PARAMS0[MPRLOC]=0,
> +	 * MODEREG_PARAMS0[MPR]=0, MR_MPR_CTL[MR_WR_SEL]=3, and
> +	 * MR_MPR_CTL[MR_WR_USE_DEFAULT_VALUE]=1)
> +	 *
> +	 * NOTE(review): step 2 put both the A-side and the B-side into MPR
> +	 * mode, but only the B-side (bg1 = 1) is taken out again here -
> +	 * confirm that the A-side is handled by the caller.
> +	 */
> +
> +	debug("10)Take B-side devices out of MPR mode\n");
> +
> +	set_mpr_mode(priv, rank_mask, if_num, dimm_count,
> +		     /* mpr */ 0, /* bg1 */ 1);
> +
> +	/*
> +	 * a. Or you can set MR_MPR_CTL[MR_WR_USE_DEFAULT_VALUE]=0 and
> +	 * set the value you would like directly into MR_MPR_CTL[MR_WR_ADDR]
> +	 */
> +
> +	/* 11)Re-enable refresh (REF_ZQCS_INT=previous value) */
> +
> +	debug("11)Re-enable refresh (REF_ZQCS_INT=previous value)\n");
> +
> +	lmc_config.u64 = lmc_rd(priv, CVMX_LMCX_CONFIG(if_num));
> +	lmc_config.cn78xx.ref_zqcs_int = save_ref_zqcs_int;
> +	lmc_wr(priv, CVMX_LMCX_CONFIG(if_num), lmc_config.u64);
> +}
> +
> +/*
> + * Check that every pair of adjacent entries in a byte-lane sequence holds
> + * an allowed combination of values.
> + *
> + * @wl:		per-byte-lane values; the callers in this file mask them
> + *		to the range 0..3
> + * @seq:	byte-lane indices in checking order, terminated by a
> + *		negative entry
> + *
> + * Returns 0 when all adjacent pairs are allowed, 1 as soon as one pair is
> + * not.
> + */
> +static int validate_hwl_seq(int *wl, int *seq)
> +{
> +	/* one bit per (current << 2 | next) combination; bit set = allowed */
> +	const int valid_pairs = 0xBDE7;
> +	int i;
> +
> +	/* stop as soon as the *next* sequence entry is the terminator */
> +	for (i = 0; seq[i + 1] >= 0; i++) {
> +		int cur = wl[seq[i]];
> +		int next = wl[seq[i + 1]];
> +		int pair = (cur << 2) | next;
> +
> +		if (((1 << pair) & valid_pairs) == 0)
> +			return 1;
> +	}
> +
> +	return 0;
> +}
> +
> +/*
> + * Validate the hardware write-leveling values of one rank.
> + *
> + * Bits [2:1] of each byte lane's setting are extracted and the lanes are
> + * then checked pairwise, in board routing order, by validate_hwl_seq().
> + * In the CSR, bytes 0-7 are always data and byte 8 is ECC.
> + *
> + * Returns the number of failing sequences (0 means OK).
> + */
> +static int validate_hw_wl_settings(int if_num,
> +				   union cvmx_lmcx_wlevel_rankx
> +				   *lmc_wlevel_rank, int is_rdimm, int ecc_ena)
> +{
> +	/* UDIMM: byte 0 first, ECC in the middle */
> +	int udimm_ecc[] = { 0, 1, 2, 3, 8, 4, 5, 6, 7, -1 };
> +	/* UDIMM: byte 0 first, no ECC */
> +	int udimm_noecc[] = { 0, 1, 2, 3, 4, 5, 6, 7, -1 };
> +	/* RDIMM, first half: ECC first, then bytes 3 down to 0 */
> +	int rdimm_lo_ecc[] = { 8, 3, 2, 1, 0, -1 };
> +	/* RDIMM, first half without ECC: bytes 3 down to 0 */
> +	int rdimm_lo_noecc[] = { 3, 2, 1, 0, -1 };
> +	/* RDIMM, second half: bytes 4 up to 7 */
> +	int rdimm_hi[] = { 4, 5, 6, 7, -1 };
> +	int wl[9];
> +	int lane, failures;
> +
> +	for (lane = 0; lane < (8 + ecc_ena); lane++)
> +		wl[lane] = (get_wl_rank(lmc_wlevel_rank, lane) >> 1) & 3;
> +
> +	if (!is_rdimm)
> +		return validate_hwl_seq(wl, ecc_ena ? udimm_ecc : udimm_noecc);
> +
> +	failures = validate_hwl_seq(wl, ecc_ena ? rdimm_lo_ecc : rdimm_lo_noecc);
> +	failures += validate_hwl_seq(wl, rdimm_hi);
> +
> +	return failures;
> +}
> +
> +/*
> + * Extract the 3-bit field with index @x from @u: bits [1:0] of the result
> + * come from bits [x * 12 + 6 : x * 12 + 5] of @u, bit 2 comes from
> + * bit (51 + x) of @u.
> + */
> +static unsigned int extr_wr(u64 u, int x)
> +{
> +	const int lo_shift = x * 12 + 5;
> +	const int hi_shift = 51 + x - 2;	/* moves bit 51+x to bit 2 */
> +	u64 lo = (u >> lo_shift) & 0x3ULL;
> +	u64 hi = (u >> hi_shift) & 0x4ULL;
> +
> +	return (unsigned int)(lo | hi);
> +}
> +
> +/*
> + * Store the low 3 bits of @v as the field with index @x in *@up: bits
> + * [1:0] go to bits [x * 12 + 6 : x * 12 + 5], bit 2 goes to bit (51 + x).
> + * All other bits of *@up are preserved.
> + */
> +static void insrt_wr(u64 *up, int x, int v)
> +{
> +	const int lo_shift = x * 12 + 5;
> +	const int hi_bit = 51 + x;
> +	u64 field_mask = ((0x3ULL) << lo_shift) | ((0x1ULL) << hi_bit);
> +	u64 val = *up & ~field_mask;
> +
> +	val |= (v & 0x3ULL) << lo_shift;
> +	val |= (v & 0x4ULL) << (hi_bit - 2);
> +	*up = val;
> +}
> +
> +/* Read out Deskew Settings for DDR */
> +
> +/*
> + * Deskew readings of one byte lane, one entry per bit.
> + * The code in this file treats bits [2:0] of an entry as flags and the
> + * bits above them as the deskew value.
> + */
> +struct deskew_bytes {
> +	u16 bits[8];
> +};
> +
> +/*
> + * Deskew readings of a whole interface: up to 9 byte lanes
> + * (presumably 8 data lanes plus ECC - the users index it up to
> + * 8 + ecc_ena lanes).
> + */
> +struct deskew_data {
> +	struct deskew_bytes bytes[9];
> +};
> +
> +/* One DAC value per byte lane, up to 9 byte lanes */
> +struct dac_data {
> +	int bytes[9];
> +};
> +
> +/*
> + * Values written to PHY_CTL[DSK_DBG_BIT_SEL] to read the 8 data bits of
> + * a byte lane, indexed by data bit number.
> + */
> +/* T88 pass 1, skip 4=DAC */
> +static const u8 dsk_bit_seq_p1[8] = { 0, 1, 2, 3, 5, 6, 7, 8 };
> +/* T88 Pass 2, skip 4=DAC and 5=DBI */
> +static const u8 dsk_bit_seq_p2[8] = { 0, 1, 2, 3, 6, 7, 8, 9 };
> +
> +/*
> + * Read the per-bit deskew settings of all active byte lanes through the
> + * PHY_CTL deskew debug interface.
> + *
> + * @priv:	driver private data, handed to the LMC accessors
> + * @if_num:	LMC interface number
> + * @dskdat:	result buffer; it is cleared first, so lanes that are not
> + *		read stay 0. Each entry holds the low 10 bits of
> + *		PHY_CTL[DSK_DBG_RD_DATA].
> + *
> + * NOTE(review): the completion poll has no timeout; if
> + * DSK_DBG_RD_COMPLETE never becomes 1 this function does not return.
> + */
> +static void get_deskew_settings(struct ddr_priv *priv, int if_num,
> +				struct deskew_data *dskdat)
> +{
> +	union cvmx_lmcx_phy_ctl phy_ctl;
> +	union cvmx_lmcx_config lmc_config;
> +	int bit_index;
> +	int byte_lane, byte_limit;
> +	/*
> +	 * NOTE: these are for pass 2.x
> +	 * Every chip that is not a CN78XX pass 1.x uses the pass 2 table.
> +	 */
> +	int is_o78p2 = !octeon_is_cpuid(OCTEON_CN78XX_PASS1_X);
> +	const u8 *bit_seq = (is_o78p2) ? dsk_bit_seq_p2 : dsk_bit_seq_p1;
> +
> +	/* 8 data lanes (4 in 32-bit mode) plus one lane if ECC is enabled */
> +	lmc_config.u64 = lmc_rd(priv, CVMX_LMCX_CONFIG(if_num));
> +	byte_limit = ((!lmc_config.s.mode32b) ? 8 : 4) + lmc_config.s.ecc_ena;
> +
> +	memset(dskdat, 0, sizeof(*dskdat));
> +
> +	phy_ctl.u64 = lmc_rd(priv, CVMX_LMCX_PHY_CTL(if_num));
> +	phy_ctl.s.dsk_dbg_clk_scaler = 3;
> +
> +	for (byte_lane = 0; byte_lane < byte_limit; byte_lane++) {
> +		phy_ctl.s.dsk_dbg_byte_sel = byte_lane;	/* set byte lane */
> +
> +		for (bit_index = 0; bit_index < 8; ++bit_index) {
> +			/* set bit number and start read sequence */
> +			phy_ctl.s.dsk_dbg_bit_sel = bit_seq[bit_index];
> +			phy_ctl.s.dsk_dbg_rd_start = 1;
> +			lmc_wr(priv, CVMX_LMCX_PHY_CTL(if_num), phy_ctl.u64);
> +
> +			/* poll for read sequence to complete */
> +			do {
> +				phy_ctl.u64 =
> +					lmc_rd(priv, CVMX_LMCX_PHY_CTL(if_num));
> +			} while (phy_ctl.s.dsk_dbg_rd_complete != 1);
> +
> +			/* record the data */
> +			dskdat->bytes[byte_lane].bits[bit_index] =
> +				phy_ctl.s.dsk_dbg_rd_data & 0x3ff;
> +		}
> +	}
> +}
> +
> +/*
> + * Print the deskew settings in @dskdat, one line per active byte lane,
> + * bit 7 first.
> + *
> + * Nothing is printed when @print_enable is 0; a value of 3 or more tags
> + * the lines as "FINAL". The flag bits of each entry are shown as a
> + * single marker character after the deskew value.
> + */
> +static void display_deskew_settings(struct ddr_priv *priv, int if_num,
> +				    struct deskew_data *dskdat,
> +				    int print_enable)
> +{
> +	const char *marks = " ?-=+*#&";
> +	const char *tag = (print_enable >= 3) ? "FINAL" : "     ";
> +	union cvmx_lmcx_config lmc_config;
> +	int lane, bit, lane_count;
> +	u16 raw, flags, deskew;
> +
> +	lmc_config.u64 = lmc_rd(priv, CVMX_LMCX_CONFIG(if_num));
> +	lane_count = ((lmc_config.s.mode32b) ? 4 : 8) + lmc_config.s.ecc_ena;
> +
> +	/* column header with the bit numbers */
> +	if (print_enable) {
> +		debug("N0.LMC%d: Deskew Data:              Bit =>      :",
> +		      if_num);
> +		for (bit = 7; bit >= 0; bit--)
> +			debug(" %3d  ", bit);
> +		debug("\n");
> +	}
> +
> +	for (lane = 0; lane < lane_count; lane++) {
> +		if (print_enable)
> +			debug("N0.LMC%d: Bit Deskew Byte %d %s               :",
> +			      if_num, lane, tag);
> +
> +		for (bit = 7; bit >= 0; bit--) {
> +			raw = dskdat->bytes[lane].bits[bit];
> +			flags = raw & 7;
> +			deskew = raw >> 3;
> +
> +			if (print_enable)
> +				debug(" %3d %c", deskew, marks[flags ^ 1]);
> +		}
> +
> +		if (print_enable)
> +			debug("\n");
> +	}
> +}
> +
> +#if ALLOW_BY_RANK_INIT
> +/*
> + * Load the deskew values in @dskdat into the PHY and make it use them.
> + *
> + * For every active byte lane the 7-bit deskew values of its 8 bits are
> + * packed into GENERAL_PURPOSE0 and loaded with the PHY_CTL deskew debug
> + * interface in write mode. Afterwards DSK_DBG_OVERWRT_ENA is set and
> + * write mode is switched off again.
> + *
> + * NOTE(review): the completion poll has no timeout.
> + */
> +static void override_deskew_settings(struct ddr_priv *priv, int if_num,
> +				     struct deskew_data *dskdat)
> +{
> +	union cvmx_lmcx_phy_ctl phy_ctl;
> +	union cvmx_lmcx_config lmc_config;
> +
> +	int bit, byte_lane, byte_limit;
> +	u64 csr_data;
> +
> +	lmc_config.u64 = lmc_rd(priv, CVMX_LMCX_CONFIG(if_num));
> +	byte_limit = ((lmc_config.s.mode32b) ? 4 : 8) + lmc_config.s.ecc_ena;
> +
> +	phy_ctl.u64 = lmc_rd(priv, CVMX_LMCX_PHY_CTL(if_num));
> +
> +	phy_ctl.s.phy_reset = 0;
> +	phy_ctl.s.dsk_dbg_num_bits_sel = 1;
> +	phy_ctl.s.dsk_dbg_offset = 0;
> +	phy_ctl.s.dsk_dbg_clk_scaler = 3;
> +
> +	/* write mode, with the overwrite still disabled while loading */
> +	phy_ctl.s.dsk_dbg_wr_mode = 1;
> +	phy_ctl.s.dsk_dbg_load_dis = 0;
> +	phy_ctl.s.dsk_dbg_overwrt_ena = 0;
> +
> +	phy_ctl.s.phy_dsk_reset = 0;
> +
> +	lmc_wr(priv, CVMX_LMCX_PHY_CTL(if_num), phy_ctl.u64);
> +	lmc_rd(priv, CVMX_LMCX_PHY_CTL(if_num));
> +
> +	for (byte_lane = 0; byte_lane < byte_limit; byte_lane++) {
> +		csr_data = 0;
> +		// FIXME: can we ignore DBI?
> +		for (bit = 0; bit < 8; ++bit) {
> +			/* fetch input and adjust: drop the 3 flag bits */
> +			u64 bits = (dskdat->bytes[byte_lane].bits[bit] >> 3) &
> +				0x7F;
> +
> +			/*
> +			 * lmc_general_purpose0.data[6:0]    // DQ0
> +			 * lmc_general_purpose0.data[13:7]   // DQ1
> +			 * lmc_general_purpose0.data[20:14]  // DQ2
> +			 * lmc_general_purpose0.data[27:21]  // DQ3
> +			 * lmc_general_purpose0.data[34:28]  // DQ4
> +			 * lmc_general_purpose0.data[41:35]  // DQ5
> +			 * lmc_general_purpose0.data[48:42]  // DQ6
> +			 * lmc_general_purpose0.data[55:49]  // DQ7
> +			 * lmc_general_purpose0.data[62:56]  // DBI
> +			 */
> +			csr_data |= (bits << (7 * bit));
> +
> +		} /* for (bit = 0; bit < 8; ++bit) */
> +
> +		/* update GP0 with the bit data for this byte lane */
> +		lmc_wr(priv, CVMX_LMCX_GENERAL_PURPOSE0(if_num), csr_data);
> +		lmc_rd(priv, CVMX_LMCX_GENERAL_PURPOSE0(if_num));
> +
> +		/* start the deskew load sequence */
> +		phy_ctl.s.dsk_dbg_byte_sel = byte_lane;
> +		phy_ctl.s.dsk_dbg_rd_start = 1;
> +		lmc_wr(priv, CVMX_LMCX_PHY_CTL(if_num), phy_ctl.u64);
> +
> +		/* poll for the sequence to complete, every 100 us */
> +		do {
> +			udelay(100);
> +			phy_ctl.u64 = lmc_rd(priv, CVMX_LMCX_PHY_CTL(if_num));
> +		} while (phy_ctl.s.dsk_dbg_rd_complete != 1);
> +	}
> +
> +	/* tell phy to use the new settings */
> +	phy_ctl.s.dsk_dbg_overwrt_ena = 1;
> +	phy_ctl.s.dsk_dbg_rd_start = 0;
> +	lmc_wr(priv, CVMX_LMCX_PHY_CTL(if_num), phy_ctl.u64);
> +
> +	/* leave write mode in a separate register write */
> +	phy_ctl.s.dsk_dbg_wr_mode = 0;
> +	lmc_wr(priv, CVMX_LMCX_PHY_CTL(if_num), phy_ctl.u64);
> +}
> +
> +/*
> + * Merge the per-rank VREF DAC settings into one all-rank setting.
> + *
> + * The DAC values of all ranks in @rank_mask (ranks 0..3) are averaged per
> + * byte lane with integer division and the averages are loaded with
> + * load_dac_override(). When the second selected rank is processed, byte
> + * lanes that differ by more than 19 from the first rank are reported in
> + * a debug message.
> + *
> + * @dacdat:	array indexed by rank number
> + *
> + * The average divides by the number of selected ranks, so @rank_mask
> + * must select at least one of the ranks 0..3.
> + */
> +static void process_by_rank_dac(struct ddr_priv *priv, int if_num,
> +				int rank_mask, struct dac_data *dacdat)
> +{
> +	union cvmx_lmcx_config lmc_config;
> +	struct dac_data avg;
> +	int lanes, lane, rank;
> +	int ranks_seen = 0;
> +	int suspect_lanes = 0;
> +
> +	lmc_config.u64 = lmc_rd(priv, CVMX_LMCX_CONFIG(if_num));
> +	lanes = ((lmc_config.s.mode32b) ? 4 : 8) + lmc_config.s.ecc_ena;
> +
> +	memset((void *)&avg, 0, sizeof(avg));
> +
> +	for (rank = 0; rank < 4; rank++) {
> +		int *rank_dac;
> +
> +		if ((rank_mask & (1 << rank)) == 0)
> +			continue;
> +		ranks_seen++;
> +		rank_dac = &dacdat[rank].bytes[0];
> +
> +		display_dac_dbi_settings(if_num, 1 /* dac */,
> +					 lmc_config.s.ecc_ena, rank_dac,
> +					 "By-Ranks VREF");
> +
> +		/* accumulate; avg holds the running sum until the division */
> +		for (lane = 0; lane < lanes; lane++) {
> +			/* FIXME: is 19 a good number? */
> +			if (ranks_seen == 2 &&
> +			    abs((avg.bytes[lane] - rank_dac[lane])) > 19)
> +				suspect_lanes |= (1 << lane);
> +
> +			avg.bytes[lane] += rank_dac[lane];
> +		}
> +	}
> +
> +	/* turn the sums into averages; FIXME: nint? */
> +	for (lane = 0; lane < lanes; lane++)
> +		avg.bytes[lane] /= ranks_seen;
> +
> +	display_dac_dbi_settings(if_num, 1 /* dac */, lmc_config.s.ecc_ena,
> +				 &avg.bytes[0], "All-Rank VREF");
> +
> +	if (suspect_lanes) {
> +		debug("N0.LMC%d: All-Rank VREF DAC Problem Bytelane(s): 0x%03x\n",
> +		      if_num, suspect_lanes);
> +	}
> +
> +	/* finally, write the averaged DAC values */
> +	for (lane = 0; lane < lanes; lane++)
> +		load_dac_override(priv, if_num, avg.bytes[lane], lane);
> +}
> +
> +/*
> + * Merge the per-rank deskew settings into one all-rank setting.
> + *
> + * For every bit of every active byte lane, the values of the ranks in
> + * @rank_mask (ranks 0..3) are averaged; entries whose flags indicate
> + * saturation (flag bits 1 or 2 set) are left out. The result is marked
> + * as locked (flag bit 0), displayed and written to the PHY with
> + * override_deskew_settings().
> + *
> + * @dskdat:	array indexed by rank number
> + */
> +static void process_by_rank_dsk(struct ddr_priv *priv, int if_num,
> +				int rank_mask, struct deskew_data *dskdat)
> +{
> +	union cvmx_lmcx_config lmc_config;
> +	struct deskew_data sum, cnt;
> +	int lanes, lane, bit, rank;
> +
> +	lmc_config.u64 = lmc_rd(priv, CVMX_LMCX_CONFIG(if_num));
> +	lanes = ((lmc_config.s.mode32b) ? 4 : 8) + lmc_config.s.ecc_ena;
> +
> +	memset((void *)&sum, 0, sizeof(sum));
> +	memset((void *)&cnt, 0, sizeof(cnt));
> +
> +	/* sum up the usable entries of all selected ranks */
> +	for (rank = 0; rank < 4; rank++) {
> +		if ((rank_mask & (1 << rank)) == 0)
> +			continue;
> +
> +		for (lane = 0; lane < lanes; lane++) {
> +			for (bit = 0; bit < 8; bit++) {
> +				u16 raw = dskdat[rank].bytes[lane].bits[bit];
> +
> +				/* saturated high or low: not usable */
> +				if (raw & 6)
> +					continue;
> +
> +				/* add the value without its flag bits */
> +				sum.bytes[lane].bits[bit] += raw & ~7;
> +				cnt.bytes[lane].bits[bit] += 1;
> +			}
> +		}
> +	}
> +
> +	/* average, then force the flags to "locked" only */
> +	for (lane = 0; lane < lanes; lane++) {
> +		for (bit = 0; bit < 8; bit++) {
> +			u16 *entry = &sum.bytes[lane].bits[bit];
> +			int n = cnt.bytes[lane].bits[bit];
> +
> +			if (n > 0)
> +				*entry = ((*entry / n) & ~7) | 1;
> +			else
> +				/* FIXME? use reset value? */
> +				*entry = (64 << 3) | 1;
> +		}
> +	}
> +
> +	/* TME for FINAL version */
> +	display_deskew_settings(priv, if_num, &sum, 3 /* VBL_TME */);
> +
> +	/* finally, write the averaged DESKEW values */
> +	override_deskew_settings(priv, if_num, &sum);
> +}
> +#endif /* ALLOW_BY_RANK_INIT */
> +
> +/*
> + * Result of validate_deskew_training().
> + *
> + * @saturated and @unlocked are plain counts over all bits. The three
> + * *_errs members are bitmasks: validate_deskew_training() sets bit N
> + * when byte lane N has that kind of error.
> + */
> +struct deskew_counts {
> +	int saturated;		/* number of bits saturated */
> +	int unlocked;		/* number of bits unlocked */
> +	int nibrng_errs;	/* nibble range errors, one bit per lane */
> +	int nibunl_errs;	/* nibble unlocked errors, one bit per lane */
> +	int bitval_errs;	/* bit value errors, one bit per lane */
> +};
> +
> +/*
> + * Deskew values below MIN_BITVAL or above MAX_BITVAL are reported as
> + * bit value errors by validate_deskew_training().
> + */
> +#define MIN_BITVAL  17
> +#define MAX_BITVAL 110
> +
> +/*
> + * Read back the deskew settings and evaluate them.
> + *
> + * @priv:	 driver private data, handed to the LMC accessors
> + * @rank_mask:	 not used by this function
> + * @if_num:	 LMC interface number
> + * @counts:	 result; cleared first, then filled with the number of
> + *		 saturated and unlocked bits and with per-byte-lane
> + *		 bitmasks of nibble range, nibble unlocked and bit value
> + *		 errors
> + * @print_flags: bit 0 enables the debug output, bit 1 tags the output
> + *		 lines as "FINAL"
> + */
> +static void validate_deskew_training(struct ddr_priv *priv, int rank_mask,
> +				     int if_num, struct deskew_counts *counts,
> +				     int print_flags)
> +{
> +	int byte_lane, bit_index, nib_num;
> +	int nibrng_errs, nibunl_errs, bitval_errs;
> +	union cvmx_lmcx_config lmc_config;
> +	s16 nib_min[2], nib_max[2], nib_unl[2];
> +	int byte_limit;
> +	int print_enable = print_flags & 1;
> +	struct deskew_data dskdat;
> +	s16 flags, deskew;
> +	const char *fc = " ?-=+*#&";
> +	int bit_last;
> +
> +	lmc_config.u64 = lmc_rd(priv, CVMX_LMCX_CONFIG(if_num));
> +	byte_limit = ((!lmc_config.s.mode32b) ? 8 : 4) + lmc_config.s.ecc_ena;
> +
> +	memset(counts, 0, sizeof(struct deskew_counts));
> +
> +	get_deskew_settings(priv, if_num, &dskdat);
> +
> +	if (print_enable) {
> +		debug("N0.LMC%d: Deskew Settings:          Bit =>      :",
> +		      if_num);
> +		for (bit_index = 7; bit_index >= 0; --bit_index)
> +			debug(" %3d  ", bit_index);
> +		debug("\n");
> +	}
> +
> +	for (byte_lane = 0; byte_lane < byte_limit; byte_lane++) {
> +		if (print_enable)
> +			debug("N0.LMC%d: Bit Deskew Byte %d %s               :",
> +			      if_num, byte_lane,
> +			      (print_flags & 2) ? "FINAL" : "     ");
> +
> +		/* per-nibble statistics: index 0 = bits 3:0, 1 = bits 7:4 */
> +		nib_min[0] = 127;
> +		nib_min[1] = 127;
> +		nib_max[0] = 0;
> +		nib_max[1] = 0;
> +		nib_unl[0] = 0;
> +		nib_unl[1] = 0;
> +
> +		/*
> +		 * In 32-bit mode only the lower nibble of byte lane 4 is
> +		 * evaluated; the output is padded to keep the columns
> +		 * aligned.
> +		 */
> +		if (lmc_config.s.mode32b == 1 && byte_lane == 4) {
> +			bit_last = 3;
> +			if (print_enable)
> +				debug("                        ");
> +		} else {
> +			bit_last = 7;
> +		}
> +
> +		for (bit_index = bit_last; bit_index >= 0; --bit_index) {
> +			nib_num = (bit_index > 3) ? 1 : 0;
> +
> +			flags = dskdat.bytes[byte_lane].bits[bit_index] & 7;
> +			deskew = dskdat.bytes[byte_lane].bits[bit_index] >> 3;
> +
> +			/* flag bits 1 and 2 indicate saturation */
> +			counts->saturated += !!(flags & 6);
> +
> +			// Do range calc even when locked; it could happen
> +			// that a bit is still unlocked after final retry,
> +			// and we want to have an external retry if a RANGE
> +			// error is present at exit...
> +			nib_min[nib_num] = min(nib_min[nib_num], deskew);
> +			nib_max[nib_num] = max(nib_max[nib_num], deskew);
> +
> +			if (!(flags & 1)) {	// only when not locked
> +				counts->unlocked += 1;
> +				nib_unl[nib_num] += 1;
> +			}
> +
> +			if (print_enable)
> +				debug(" %3d %c", deskew, fc[flags ^ 1]);
> +		}
> +
> +		/*
> +		 * Now look for nibble errors
> +		 *
> +		 * For bit 55, it looks like a bit deskew problem. When the
> +		 * upper nibble of byte 6 needs to go to saturation, bit 7
> +		 * of byte 6 locks prematurely at 64. For DIMMs with raw
> +		 * card A and B, can we reset the deskew training when we
> +		 * encounter this case? The reset criteria should be looking
> +		 * at one nibble at a time for raw card A and B; if the
> +		 * bit-deskew setting within a nibble is different by > 33,
> +		 * we'll issue a reset to the bit deskew training.
> +		 *
> +		 * LMC0 Bit Deskew Byte(6): 64 0 - 0 - 0 - 26 61 35 64
> +		 */
> +		// upper nibble range, then lower nibble range
> +		nibrng_errs = ((nib_max[1] - nib_min[1]) > 33) ? 1 : 0;
> +		nibrng_errs |= ((nib_max[0] - nib_min[0]) > 33) ? 1 : 0;
> +
> +		// check for nibble all unlocked
> +		nibunl_errs = ((nib_unl[0] == 4) || (nib_unl[1] == 4)) ? 1 : 0;
> +
> +		// check for bit value errors, ie < 17 or > 110
> +		// FIXME? assume max always > MIN_BITVAL and min < MAX_BITVAL
> +		bitval_errs = ((nib_max[1] > MAX_BITVAL) ||
> +			       (nib_max[0] > MAX_BITVAL)) ? 1 : 0;
> +		bitval_errs |= ((nib_min[1] < MIN_BITVAL) ||
> +				(nib_min[0] < MIN_BITVAL)) ? 1 : 0;
> +
> +		/* append R/U/V markers for range/unlocked/value errors */
> +		if ((nibrng_errs != 0 || nibunl_errs != 0 ||
> +		     bitval_errs != 0) && print_enable) {
> +			debug(" %c%c%c",
> +			      (nibrng_errs) ? 'R' : ' ',
> +			      (nibunl_errs) ? 'U' : ' ',
> +			      (bitval_errs) ? 'V' : ' ');
> +		}
> +
> +		if (print_enable)
> +			debug("\n");
> +
> +		/* record the errors as one bit per byte lane */
> +		counts->nibrng_errs |= (nibrng_errs << byte_lane);
> +		counts->nibunl_errs |= (nibunl_errs << byte_lane);
> +		counts->bitval_errs |= (bitval_errs << byte_lane);
> +	}
> +}
> +
> +/*
> + * Load a VREF DAC override value through DLL_CTL3 and switch VREF bypass
> + * on.
> + *
> + * @priv:	driver private data, handed to the LMC accessors
> + * @if_num:	LMC interface number
> + * @dac_value:	DAC value; it is shifted right by one bit before being
> + *		written to the OFFSET field
> + * @byte:	byte lane number, or 0x0A for all byte lanes
> + *
> + * Returns the value of GET_DDR_DLL_CTL3(offset) after the sequence.
> + * NOTE(review): SET_DDR_DLL_CTL3()/GET_DDR_DLL_CTL3() are defined outside
> + * this chunk; presumably they access the local ddr_dll_ctl3 - confirm.
> + */
> +static unsigned short load_dac_override(struct ddr_priv *priv, int if_num,
> +					int dac_value, int byte)
> +{
> +	union cvmx_lmcx_dll_ctl3 ddr_dll_ctl3;
> +	/* single bytelanes incr by 1; A is for ALL */
> +	int bytex = (byte == 0x0A) ? byte : byte + 1;
> +
> +	ddr_dll_ctl3.u64 = lmc_rd(priv, CVMX_LMCX_DLL_CTL3(if_num));
> +
> +	SET_DDR_DLL_CTL3(byte_sel, bytex);
> +	SET_DDR_DLL_CTL3(offset, dac_value >> 1);
> +
> +	/* each BIT_SELECT step below is a separate register write */
> +	ddr_dll_ctl3.cn73xx.bit_select = 0x9;	/* No-op */
> +	lmc_wr(priv, CVMX_LMCX_DLL_CTL3(if_num), ddr_dll_ctl3.u64);
> +
> +	ddr_dll_ctl3.cn73xx.bit_select = 0xC;	/* vref bypass setting load */
> +	lmc_wr(priv, CVMX_LMCX_DLL_CTL3(if_num), ddr_dll_ctl3.u64);
> +
> +	ddr_dll_ctl3.cn73xx.bit_select = 0xD;	/* vref bypass on. */
> +	lmc_wr(priv, CVMX_LMCX_DLL_CTL3(if_num), ddr_dll_ctl3.u64);
> +
> +	ddr_dll_ctl3.cn73xx.bit_select = 0x9;	/* No-op */
> +	lmc_wr(priv, CVMX_LMCX_DLL_CTL3(if_num), ddr_dll_ctl3.u64);
> +
> +	lmc_rd(priv, CVMX_LMCX_DLL_CTL3(if_num));	/* flush writes */
> +
> +	return (unsigned short)GET_DDR_DLL_CTL3(offset);
> +}
> +
> +/*
> + * Read the DAC or the DBI setting of byte lanes 0 through 8 through the
> + * PHY_CTL deskew debug interface.
> + *
> + * @priv:	driver private data, handed to the LMC accessors
> + * @if_num:	LMC interface number
> + * @dac_or_dbi:	1 for DAC, 0 for DBI
> + * @settings:	result array with 9 entries (bytelanes 0 through 8).
> + *		DAC entries hold the low 8 bits of the value read, DBI
> + *		entries hold the value read including its flag bits.
> + *
> + * Returns 0 if OK, -1 if a problem (DBI requested on a chip without the
> + * new deskew layout). Note that PHY_CTL[DSK_DBG_CLK_SCALER] has already
> + * been written when -1 is returned.
> + *
> + * NOTE(review): the completion poll has no timeout.
> + */
> +static int read_dac_dbi_settings(struct ddr_priv *priv, int if_num,
> +				 int dac_or_dbi, int *settings)
> +{
> +	union cvmx_lmcx_phy_ctl phy_ctl;
> +	int byte_lane, bit_num;
> +	int deskew;
> +	int dac_value;
> +	int new_deskew_layout = 0;
> +
> +	/* CN73XX, CNF75XX and CN78XX other than pass 1.x */
> +	new_deskew_layout = octeon_is_cpuid(OCTEON_CN73XX) ||
> +		octeon_is_cpuid(OCTEON_CNF75XX);
> +	new_deskew_layout |= (octeon_is_cpuid(OCTEON_CN78XX) &&
> +			      !octeon_is_cpuid(OCTEON_CN78XX_PASS1_X));
> +
> +	phy_ctl.u64 = lmc_rd(priv, CVMX_LMCX_PHY_CTL(if_num));
> +	phy_ctl.s.dsk_dbg_clk_scaler = 3;
> +	lmc_wr(priv, CVMX_LMCX_PHY_CTL(if_num), phy_ctl.u64);
> +
> +	/* bit select 4 reads the DAC, bit select 5 reads DBI */
> +	bit_num = (dac_or_dbi) ? 4 : 5;
> +	/* DBI not available */
> +	if (bit_num == 5 && !new_deskew_layout)
> +		return -1;
> +
> +	// FIXME: always assume ECC is available
> +	for (byte_lane = 8; byte_lane >= 0; --byte_lane) {
> +		/* set byte lane and bit to read */
> +		phy_ctl.s.dsk_dbg_bit_sel = bit_num;
> +		phy_ctl.s.dsk_dbg_byte_sel = byte_lane;
> +		lmc_wr(priv, CVMX_LMCX_PHY_CTL(if_num), phy_ctl.u64);
> +
> +		/* start read sequence */
> +		phy_ctl.u64 = lmc_rd(priv, CVMX_LMCX_PHY_CTL(if_num));
> +		phy_ctl.s.dsk_dbg_rd_start = 1;
> +		lmc_wr(priv, CVMX_LMCX_PHY_CTL(if_num), phy_ctl.u64);
> +
> +		/* poll for read sequence to complete */
> +		do {
> +			phy_ctl.u64 = lmc_rd(priv, CVMX_LMCX_PHY_CTL(if_num));
> +		} while (phy_ctl.s.dsk_dbg_rd_complete != 1);
> +
> +		/* keep the flag bits where they are for DBI */
> +		deskew = phy_ctl.s.dsk_dbg_rd_data; /* >> 3 */
> +		dac_value = phy_ctl.s.dsk_dbg_rd_data & 0xff;
> +
> +		settings[byte_lane] = (dac_or_dbi) ? dac_value : deskew;
> +	}
> +
> +	return 0;
> +}
> +
> +/*
> + * Print out the DAC or DBI settings array on one debug line, highest
> + * byte lane first.
> + *
> + * @lmc:	interface number used in the line prefix
> + * @dac_or_dbi:	1 for DAC, 0 for DBI
> + * @ecc_ena:	added to 7 to get the highest byte lane printed
> + * @settings:	values as produced by read_dac_dbi_settings()
> + * @title:	text inserted into the line prefix
> + *
> + * DAC values are the low 8 bits of an entry and are always printed with
> + * the blank ("locked") marker. DBI entries carry flags in bits [2:0] and
> + * a 7-bit value above them.
> + */
> +static void display_dac_dbi_settings(int lmc, int dac_or_dbi,
> +				     int ecc_ena, int *settings, char *title)
> +{
> +	const char *marks = " ?-=+*#&";
> +	const int top = 7 + ecc_ena;
> +	int lane;
> +
> +	debug("N0.LMC%d: %s %s Settings %d:0 :",
> +	      lmc, title, (dac_or_dbi) ? "DAC" : "DBI", top);
> +
> +	/* FIXME: what about 32-bit mode? */
> +	for (lane = top; lane >= 0; lane--) {
> +		int raw = settings[lane];
> +		int flags = dac_or_dbi ? 1 : (raw & 7);
> +		int value = dac_or_dbi ? (raw & 0xff) : ((raw >> 3) & 0x7f);
> +
> +		debug(" %3d %c", value, marks[flags ^ 1]);
> +	}
> +	debug("\n");
> +}
> +
> +/*
> + * Find a HWL majority
> + *
> + * Scans the four counters in @bc and reports:
> + *   @mx - index of the largest counter (the lowest index wins a tie),
> + *	   or -1 when no counter is greater than zero
> + *   @mc - bitmask of the indices whose counter is greater than zero
> + *   @xc - the largest counter value found (0 when there is none)
> + *   @cc - how many counters are greater than zero
> + *
> + * Returns @mx multiplied by two, i.e. -2 when there is no majority.
> + */
> +static int find_wl_majority(struct wlevel_bitcnt *bc, int *mx, int *mc,
> +			    int *xc, int *cc)
> +{
> +	int ix, ic;
> +
> +	*mx = -1;
> +	*mc = 0;
> +	*xc = 0;
> +	*cc = 0;
> +
> +	for (ix = 0; ix < 4; ix++) {
> +		ic = bc->bitcnt[ix];
> +
> +		/* make a bitmask of the ones with a count */
> +		if (ic > 0) {
> +			*mc |= (1 << ix);
> +			*cc += 1;	/* count how many had non-zero counts */
> +		}
> +
> +		/* find the majority: strictly greater, so first max wins */
> +		if (ic > *xc) {
> +			*xc = ic;
> +			*mx = ix;
> +		}
> +	}
> +
> +	/*
> +	 * Multiply instead of shifting: *mx is still -1 when every counter
> +	 * is zero, and left-shifting a negative value is undefined in C.
> +	 */
> +	return *mx * 2;
> +}
> +
> +/*
> + * Evaluate the DAC settings array
> + *
> + * @if_64b:	non-zero for 8 data byte lanes, zero for 4
> + * @ecc_ena:	1 if the byte lane following the data lanes is included
> + * @settings:	DAC value per byte lane; only the low 8 bits are used
> + *
> + * Returns 1 if any two of the evaluated byte lanes differ by more than
> + * 25, otherwise 0.
> + *
> + * FIXME: change the check...???
> + * This looks only for sets of DAC values whose max/min differ by a lot;
> + * let any EVEN go so long as it is within range...
> + */
> +static int evaluate_dac_settings(int if_64b, int ecc_ena, int *settings)
> +{
> +	int last = ((if_64b) ? 7 : 3) + ecc_ena;
> +	int lo = 0xff, hi = 0;
> +	int lane, dac;
> +
> +	/*
> +	 * Two lanes differ by more than the limit exactly when the largest
> +	 * and the smallest value do, so one pass tracking min and max
> +	 * replaces comparing every pair of lanes.
> +	 */
> +	for (lane = last; lane >= 0; --lane) {
> +		dac = settings[lane] & 0xff;
> +		if (dac < lo)
> +			lo = dac;
> +		if (dac > hi)
> +			hi = dac;
> +	}
> +
> +	return (hi - lo > 25) ? 1 : 0;
> +}
> +
> +/*
> + * Run the LMC offset training sequence (sequence 0x0B).
> + *
> + * PHY_CTL[DAC_ON] is set first. If lookup_env_ull() returns a string for
> + * "ddr_phy_ctl", the parsed value replaces the whole PHY_CTL value
> + * instead. PHY_CTL is only written when the resulting value differs from
> + * the one read.
> + *
> + * @priv:	driver private data, handed to the LMC accessors
> + * @rank_mask:	rank mask passed to the training sequence
> + * @if_num:	LMC interface number
> + */
> +static void perform_offset_training(struct ddr_priv *priv, int rank_mask,
> +				    int if_num)
> +{
> +	union cvmx_lmcx_phy_ctl lmc_phy_ctl;
> +	u64 orig_phy_ctl;
> +	const char *s;
> +
> +	/*
> +	 * 4.8.6 LMC Offset Training
> +	 *
> +	 * LMC requires input-receiver offset training.
> +	 *
> +	 * 1. Write LMC(0)_PHY_CTL[DAC_ON] = 1
> +	 */
> +	lmc_phy_ctl.u64 = lmc_rd(priv, CVMX_LMCX_PHY_CTL(if_num));
> +	orig_phy_ctl = lmc_phy_ctl.u64;
> +	lmc_phy_ctl.s.dac_on = 1;
> +
> +	/* allow full CSR override */
> +	s = lookup_env_ull(priv, "ddr_phy_ctl");
> +	if (s)
> +		lmc_phy_ctl.u64 = strtoull(s, NULL, 0);
> +
> +	/* do not print or write if CSR does not change... */
> +	if (lmc_phy_ctl.u64 != orig_phy_ctl) {
> +		debug("PHY_CTL                                       : 0x%016llx\n",
> +		      lmc_phy_ctl.u64);
> +		lmc_wr(priv, CVMX_LMCX_PHY_CTL(if_num), lmc_phy_ctl.u64);
> +	}
> +
> +	/*
> +	 * 2. Write LMC(0)_SEQ_CTL[SEQ_SEL] = 0x0B and
> +	 *    LMC(0)_SEQ_CTL[INIT_START] = 1.
> +	 *
> +	 * 3. Wait for LMC(0)_SEQ_CTL[SEQ_COMPLETE] to be set to 1.
> +	 */
> +	/* Start Offset training sequence */
> +	oct3_ddr3_seq(priv, rank_mask, if_num, 0x0B);
> +}
> +
> +/*
> + * Run the LMC internal VREF training (sequence 0x0A with
> + * EXT_CONFIG[VREFINT_SEQ_DESKEW] = 0).
> + *
> + * Before the training, VREF bypass is switched off for all byte lanes
> + * through DLL_CTL3.
> + *
> + * @priv:	driver private data, handed to the LMC accessors
> + * @rank_mask:	rank mask passed to the training sequence
> + * @if_num:	LMC interface number
> + */
> +static void perform_internal_vref_training(struct ddr_priv *priv,
> +					   int rank_mask, int if_num)
> +{
> +	union cvmx_lmcx_ext_config ext_config;
> +	union cvmx_lmcx_dll_ctl3 ddr_dll_ctl3;
> +
> +	/* First, make sure all byte-lanes are out of VREF bypass mode */
> +	ddr_dll_ctl3.u64 = lmc_rd(priv, CVMX_LMCX_DLL_CTL3(if_num));
> +
> +	ddr_dll_ctl3.cn78xx.byte_sel = 0x0A;	/* all byte-lanes */
> +	ddr_dll_ctl3.cn78xx.bit_select = 0x09;	/* No-op */
> +	lmc_wr(priv, CVMX_LMCX_DLL_CTL3(if_num), ddr_dll_ctl3.u64);
> +
> +	ddr_dll_ctl3.cn78xx.bit_select = 0x0E;	/* vref bypass off. */
> +	lmc_wr(priv, CVMX_LMCX_DLL_CTL3(if_num), ddr_dll_ctl3.u64);
> +
> +	ddr_dll_ctl3.cn78xx.bit_select = 0x09;	/* No-op */
> +	lmc_wr(priv, CVMX_LMCX_DLL_CTL3(if_num), ddr_dll_ctl3.u64);
> +
> +	/*
> +	 * 4.8.7 LMC Internal vref Training
> +	 *
> +	 * LMC requires input-reference-voltage training.
> +	 *
> +	 * 1. Write LMC(0)_EXT_CONFIG[VREFINT_SEQ_DESKEW] = 0.
> +	 */
> +	ext_config.u64 = lmc_rd(priv, CVMX_LMCX_EXT_CONFIG(if_num));
> +	ext_config.s.vrefint_seq_deskew = 0;
> +
> +	ddr_seq_print("Performing LMC sequence: vrefint_seq_deskew = %d\n",
> +		      ext_config.s.vrefint_seq_deskew);
> +
> +	lmc_wr(priv, CVMX_LMCX_EXT_CONFIG(if_num), ext_config.u64);
> +
> +	/*
> +	 * 2. Write LMC(0)_SEQ_CTL[SEQ_SEL] = 0x0a and
> +	 *    LMC(0)_SEQ_CTL[INIT_START] = 1.
> +	 *
> +	 * 3. Wait for LMC(0)_SEQ_CTL[SEQ_COMPLETE] to be set to 1.
> +	 */
> +	/* Start LMC Internal vref Training */
> +	oct3_ddr3_seq(priv, rank_mask, if_num, 0x0A);
> +}
> +
> +#define dbg_avg(format, ...)	// debug(format, ##__VA_ARGS__)
> +
> +/*
> + * Average a set of samples after dropping one minimum and one maximum,
> + * and return an odd result.
> + *
> + * @bytes:	 the samples
> + * @num_samples: number of samples; the average divides by
> + *		 (num_samples - 2), so more than 2 samples are required
> + * @lmc:	 interface number, only used in the debug output
> + * @lane_no:	 byte lane number, only used in the debug output
> + *
> + * The truncated average is returned if it is odd; otherwise the average
> + * rounded via divide_nint() is returned if that is odd; otherwise the
> + * truncated average plus one.
> + */
> +static int process_samples_average(s16 *bytes, int num_samples,
> +				   int lmc, int lane_no)
> +{
> +	const int kept = num_samples - 2;	/* min and max are dropped */
> +	s16 smin = 32767, smax = -32768;
> +	int nmin = 0, nmax = 0;
> +	int total = 0;
> +	int trimmed, tenths, rounded, trunc_avg, ret;
> +	int i;
> +
> +	dbg_avg("DBG_AVG%d.%d: ", lmc, lane_no);
> +
> +	for (i = 0; i < num_samples; i++) {
> +		s16 v = bytes[i];
> +
> +		total += v;
> +		smin = (v < smin) ? v : smin;
> +		smax = (v > smax) ? v : smax;
> +		dbg_avg(" %3d", v);
> +	}
> +
> +	/* how often the extremes occur; only used by the debug output */
> +	for (i = 0; i < num_samples; i++) {
> +		nmin += (bytes[i] == smin);
> +		nmax += (bytes[i] == smax);
> +	}
> +	dbg_avg(" (min=%3d/%d, max=%3d/%d, range=%2d, samples=%2d)",
> +		smin, nmin, smax, nmax, rng, num_samples);
> +
> +	trimmed = total - smin - smax;
> +
> +	/* average in tenths, rounded, and plain truncated average */
> +	tenths = divide_nint(trimmed * 10, kept);
> +	trunc_avg = trimmed / kept;
> +
> +	dbg_avg(" [%3d.%d, %3d]", tenths / 10, tenths % 10, trunc_avg);
> +
> +	rounded = divide_nint(tenths, 10);
> +
> +	if (trunc_avg & 1)
> +		ret = trunc_avg;
> +	else if (rounded & 1)
> +		ret = rounded;
> +	else
> +		ret = trunc_avg + 1;
> +
> +	dbg_avg(" -> %3d\n", ret);
> +
> +	return ret;
> +}
> +
> +/* Limits and delay used by perform_deskew_training() */
> +#define DEFAULT_SAT_RETRY_LIMIT    11	/* 1 + 10 retries */
> +
> +#define default_lock_retry_limit   20	/* 20 retries */
> +#define deskew_validation_delay    10000	/* 10 millisecs, for udelay() */
> +
> +static int perform_deskew_training(struct ddr_priv *priv, int rank_mask,
> +				   int if_num, int spd_rawcard_aorb)
> +{
> +	int unsaturated, locked;
> +	int sat_retries, sat_retries_limit;
> +	int lock_retries, lock_retries_total, lock_retries_limit;
> +	int print_first;
> +	int print_them_all;
> +	struct deskew_counts dsk_counts;
> +	union cvmx_lmcx_phy_ctl phy_ctl;
> +	char *s;
> +	int has_no_sat = octeon_is_cpuid(OCTEON_CN78XX_PASS2_X) ||
> +		octeon_is_cpuid(OCTEON_CNF75XX);
> +	int disable_bitval_retries = 1;	// default to disabled
> +
> +	debug("N0.LMC%d: Performing Deskew Training.\n", if_num);
> +
> +	sat_retries = 0;
> +	sat_retries_limit = (has_no_sat) ? 5 : DEFAULT_SAT_RETRY_LIMIT;
> +
> +	lock_retries_total = 0;
> +	unsaturated = 0;
> +	print_first = 1;	// print the first one
> +	// set to true for printing all normal deskew attempts
> +	print_them_all = 0;
> +
> +	// provide override for bitval_errs causing internal VREF retries
> +	s = env_get("ddr_disable_bitval_retries");
> +	if (s)
> +		disable_bitval_retries = !!simple_strtoul(s, NULL, 0);
> +
> +	lock_retries_limit = default_lock_retry_limit;
> +	if ((octeon_is_cpuid(OCTEON_CN78XX_PASS2_X)) ||
> +	    (octeon_is_cpuid(OCTEON_CN73XX)) ||
> +	    (octeon_is_cpuid(OCTEON_CNF75XX)))
> +		lock_retries_limit *= 2;	// give new chips twice as many
> +
> +	do {			/* while (sat_retries < sat_retry_limit) */
> +		/*
> +		 * 4.8.8 LMC Deskew Training
> +		 *
> +		 * LMC requires input-read-data deskew training.
> +		 *
> +		 * 1. Write LMC(0)_EXT_CONFIG[VREFINT_SEQ_DESKEW] = 1.
> +		 */
> +
> +		union cvmx_lmcx_ext_config ext_config;
> +
> +		ext_config.u64 = lmc_rd(priv, CVMX_LMCX_EXT_CONFIG(if_num));
> +		ext_config.s.vrefint_seq_deskew = 1;
> +
> +		ddr_seq_print
> +		    ("Performing LMC sequence: vrefint_seq_deskew = %d\n",
> +		     ext_config.s.vrefint_seq_deskew);
> +
> +		lmc_wr(priv, CVMX_LMCX_EXT_CONFIG(if_num), ext_config.u64);
> +
> +		/*
> +		 * 2. Write LMC(0)_SEQ_CTL[SEQ_SEL] = 0x0A and
> +		 *    LMC(0)_SEQ_CTL[INIT_START] = 1.
> +		 *
> +		 * 3. Wait for LMC(0)_SEQ_CTL[SEQ_COMPLETE] to be set to 1.
> +		 */
> +
> +		phy_ctl.u64 = lmc_rd(priv, CVMX_LMCX_PHY_CTL(if_num));
> +		phy_ctl.s.phy_dsk_reset = 1;	/* RESET Deskew sequence */
> +		lmc_wr(priv, CVMX_LMCX_PHY_CTL(if_num), phy_ctl.u64);
> +
> +		/* LMC Deskew Training */
> +		oct3_ddr3_seq(priv, rank_mask, if_num, 0x0A);
> +
> +		lock_retries = 0;
> +
> +perform_deskew_training:
> +
> +		phy_ctl.u64 = lmc_rd(priv, CVMX_LMCX_PHY_CTL(if_num));
> +		phy_ctl.s.phy_dsk_reset = 0;	/* Normal Deskew sequence */
> +		lmc_wr(priv, CVMX_LMCX_PHY_CTL(if_num), phy_ctl.u64);
> +
> +		/* LMC Deskew Training */
> +		oct3_ddr3_seq(priv, rank_mask, if_num, 0x0A);
> +
> +		// Moved this from validate_deskew_training
> +		/* Allow deskew results to stabilize before evaluating them. */
> +		udelay(deskew_validation_delay);
> +
> +		// Now go look at lock and saturation status...
> +		validate_deskew_training(priv, rank_mask, if_num, &dsk_counts,
> +					 print_first);
> +		// after printing the first and not doing them all, no more
> +		if (print_first && !print_them_all)
> +			print_first = 0;
> +
> +		unsaturated = (dsk_counts.saturated == 0);
> +		locked = (dsk_counts.unlocked == 0);
> +
> +		// only do locking retries if unsaturated or rawcard A or B,
> +		// otherwise full SAT retry
> +		if (unsaturated || (spd_rawcard_aorb && !has_no_sat)) {
> +			if (!locked) {	// and not locked
> +				lock_retries++;
> +				lock_retries_total++;
> +				if (lock_retries <= lock_retries_limit) {
> +					goto perform_deskew_training;
> +				} else {
> +					debug("N0.LMC%d: LOCK RETRIES failed after %d retries\n",
> +					      if_num, lock_retries_limit);
> +				}
> +			} else {
> +				// only print if we did try
> +				if (lock_retries_total > 0)
> +					debug("N0.LMC%d: LOCK RETRIES successful after %d retries\n",
> +					      if_num, lock_retries);
> +			}
> +		}		/* if (unsaturated || spd_rawcard_aorb) */
> +
> +		++sat_retries;
> +
> +		/*
> +		 * At this point, check for a DDR4 RDIMM that will not
> +		 * benefit from SAT retries; if so, exit
> +		 */
> +		if (spd_rawcard_aorb && !has_no_sat) {
> +			debug("N0.LMC%d: Deskew Training Loop: Exiting for RAWCARD == A or B.\n",
> +			      if_num);
> +			break;	// no sat or lock retries
> +		}
> +
> +	} while (!unsaturated && (sat_retries < sat_retries_limit));
> +
> +	debug("N0.LMC%d: Deskew Training %s. %d sat-retries, %d lock-retries\n",
> +	      if_num, (sat_retries >= DEFAULT_SAT_RETRY_LIMIT) ?
> +	      "Timed Out" : "Completed", sat_retries - 1, lock_retries_total);
> +
> +	// FIXME? add saturation to reasons for fault return - give it a
> +	// chance via Internal VREF
> +	// FIXME? add OPTIONAL bit value to reasons for fault return -
> +	// give it a chance via Internal VREF
> +	if (dsk_counts.nibrng_errs != 0 || dsk_counts.nibunl_errs != 0 ||
> +	    (dsk_counts.bitval_errs != 0 && !disable_bitval_retries) ||
> +	    !unsaturated) {
> +		debug("N0.LMC%d: Nibble or Saturation Error(s) found, returning FAULT\n",
> +		      if_num);
> +		// FIXME: do we want this output always for errors?
> +		validate_deskew_training(priv, rank_mask, if_num,
> +					 &dsk_counts, 1);
> +		return -1;	// we did retry locally, they did not help
> +	}
> +
> +	// NOTE: we (currently) always print one last training validation
> +	// before starting Read Leveling...
> +
> +	return 0;
> +}
> +
> +#define SCALING_FACTOR (1000)
> +
> +/*
> + * Compute a VREF setting from termination and driver impedances for a
> + * single-slot configuration.
> + *
> + * NOTE: this gets called for 1-rank and 2-rank DIMMs in single-slot config
> + *
> + * @rtt_wr:		RTT_WR in ohms; 0 and 99 (HiZ) are treated as 1M ohm
> + * @rtt_park:		RTT_PARK in ohms; treated as 1M ohm when it is 0, or
> + *			when rank_count is 1 and rtt_wr is non-zero
> + * @dqx_ctl:		DQ driver strength in ohms; 0 is treated as 1M ohm
> + * @rank_count:		number of ranks on the DIMM
> + * @dram_connection:	non-zero selects 0 ohm series resistance instead of
> + *			15 ohm
> + *
> + * The voltage is expressed in hundredths of a percent of VDD (vrefpc) and
> + * converted to steps of 65 above the selected range base (6000 for range1,
> + * 4500 for range2).
> + *
> + * Return: step count above the range base, with bit 6 set when range2 is
> + * selected.
> + */
> +static int compute_vref_1slot_2rank(int rtt_wr, int rtt_park, int dqx_ctl,
> +				    int rank_count, int dram_connection)
> +{
> +	u64 reff_s;
> +	u64 rser_s = (dram_connection) ? 0 : 15;
> +	u64 vdd = 1200;
> +	u64 vref;
> +	// 99 == HiZ
> +	u64 rtt_wr_s = (((rtt_wr == 0) || rtt_wr == 99) ?
> +			1 * 1024 * 1024 : rtt_wr);
> +	u64 rtt_park_s = (((rtt_park == 0) || ((rank_count == 1) &&
> +					       (rtt_wr != 0))) ?
> +			  1 * 1024 * 1024 : rtt_park);
> +	u64 dqx_ctl_s = (dqx_ctl == 0 ? 1 * 1024 * 1024 : dqx_ctl);
> +	int vref_value;
> +	u64 rangepc = 6000;	// range1 base
> +	u64 vrefpc;
> +	int vref_range = 0;
> +
> +	// effective termination = RTT_WR (parallel) RTT_PARK
> +	reff_s = divide_nint((rtt_wr_s * rtt_park_s), (rtt_wr_s + rtt_park_s));
> +
> +	vref = (((rser_s + dqx_ctl_s) * SCALING_FACTOR) /
> +		(rser_s + dqx_ctl_s + reff_s)) + SCALING_FACTOR;
> +
> +	vref = (vref * vdd) / 2 / SCALING_FACTOR;
> +
> +	vrefpc = (vref * 100 * 100) / vdd;
> +
> +	if (vrefpc < rangepc) {	// < range1 base, use range2
> +		vref_range = 1 << 6;	// set bit A6 for range2
> +		rangepc = 4500;	// range2 base is 45%
> +	}
> +
> +	/*
> +	 * vrefpc and rangepc are unsigned, so a value below the range base
> +	 * must be detected before subtracting; the difference would wrap
> +	 * and a "< 0" test on the converted result is not reliable.
> +	 */
> +	if (vrefpc < rangepc) {
> +		vref_value = vref_range;	// set to base of range
> +	} else {
> +		vref_value = divide_nint(vrefpc - rangepc, 65);
> +		vref_value |= vref_range;
> +	}
> +
> +	debug("rtt_wr: %d, rtt_park: %d, dqx_ctl: %d, rank_count: %d\n",
> +	      rtt_wr, rtt_park, dqx_ctl, rank_count);
> +	debug("rtt_wr_s: %lld, rtt_park_s: %lld, dqx_ctl_s: %lld, vref_value: 0x%x, range: %d\n",
> +	      rtt_wr_s, rtt_park_s, dqx_ctl_s, vref_value ^ vref_range,
> +	      vref_range ? 2 : 1);
> +
> +	return vref_value;
> +}
> +
> +/*
> + * Compute a VREF setting from termination and driver impedances for a
> + * two-slot configuration.
> + *
> + * NOTE: this gets called for 1-rank and 2-rank DIMMs in two-slot configs
> + *
> + * @rtt_wr:		RTT_WR in ohms; 0 and 99 (HiZ) are treated as 1M ohm
> + * @rtt_park_00:	first RTT_PARK value in ohms; 0 is treated as 1M ohm
> + * @rtt_park_01:	second RTT_PARK value in ohms; 0 is treated as 1M ohm
> + * @dqx_ctl:		DQ driver strength in ohms; 0 is treated as 1M ohm
> + * @rtt_nom:		RTT_NOM in ohms; 0 is treated as 1M ohm
> + * @dram_connection:	non-zero selects 0 ohm series resistance instead of
> + *			15 ohm
> + *
> + * Voltages are kept in units where VDD (1.2 in the formulas below) is
> + * 1200. The result is expressed in hundredths of a percent of VDD
> + * (vrefpc) and converted to steps of 65 above the selected range base
> + * (6000 for range1, 4500 for range2).
> + *
> + * Return: step count above the range base, with bit 6 set when range2 is
> + * selected.
> + */
> +static int compute_vref_2slot_2rank(int rtt_wr, int rtt_park_00,
> +				    int rtt_park_01,
> +				    int dqx_ctl, int rtt_nom,
> +				    int dram_connection)
> +{
> +	u64 rser = (dram_connection) ? 0 : 15;
> +	u64 vdd = 1200;
> +	u64 vl, vlp, vcm;
> +	u64 rd0, rd1, rpullup;
> +	// 99 == HiZ
> +	u64 rtt_wr_s = (((rtt_wr == 0) || rtt_wr == 99) ?
> +			1 * 1024 * 1024 : rtt_wr);
> +	u64 rtt_park_00_s = (rtt_park_00 == 0 ? 1 * 1024 * 1024 : rtt_park_00);
> +	u64 rtt_park_01_s = (rtt_park_01 == 0 ? 1 * 1024 * 1024 : rtt_park_01);
> +	u64 dqx_ctl_s = (dqx_ctl == 0 ? 1 * 1024 * 1024 : dqx_ctl);
> +	u64 rtt_nom_s = (rtt_nom == 0 ? 1 * 1024 * 1024 : rtt_nom);
> +	int vref_value;
> +	u64 rangepc = 6000;	// range1 base
> +	u64 vrefpc;
> +	int vref_range = 0;
> +
> +	// rd0 = (RTT_NOM (parallel) RTT_WR) + RSER =
> +	// ((RTT_NOM * RTT_WR) / (RTT_NOM + RTT_WR)) + RSER
> +	rd0 = divide_nint((rtt_nom_s * rtt_wr_s),
> +			  (rtt_nom_s + rtt_wr_s)) + rser;
> +
> +	// rd1 = (RTT_PARK_00 (parallel) RTT_PARK_01) + RSER =
> +	// ((RTT_PARK_00 * RTT_PARK_01) / (RTT_PARK_00 + RTT_PARK_01)) + RSER
> +	rd1 = divide_nint((rtt_park_00_s * rtt_park_01_s),
> +			  (rtt_park_00_s + rtt_park_01_s)) + rser;
> +
> +	// rpullup = rd0 (parallel) rd1 = (rd0 * rd1) / (rd0 + rd1)
> +	rpullup = divide_nint((rd0 * rd1), (rd0 + rd1));
> +
> +	// vl = (DQX_CTL / (DQX_CTL + rpullup)) * 1.2
> +	vl = divide_nint((dqx_ctl_s * vdd), (dqx_ctl_s + rpullup));
> +
> +	// vlp = ((RSER / rd0) * (1.2 - vl)) + vl
> +	vlp = divide_nint((rser * (vdd - vl)), rd0) + vl;
> +
> +	// vcm = (vlp + 1.2) / 2
> +	vcm = divide_nint((vlp + vdd), 2);
> +
> +	// vrefpc = (vcm / 1.2) * 100
> +	vrefpc = divide_nint((vcm * 100 * 100), vdd);
> +
> +	if (vrefpc < rangepc) {	// < range1 base, use range2
> +		vref_range = 1 << 6;	// set bit A6 for range2
> +		rangepc = 4500;	// range2 base is 45%
> +	}
> +
> +	/*
> +	 * vrefpc and rangepc are unsigned, so a value below the range base
> +	 * must be detected before subtracting; the difference would wrap
> +	 * and a "< 0" test on the converted result is not reliable.
> +	 */
> +	if (vrefpc < rangepc) {
> +		vref_value = vref_range;	// set to base of range
> +	} else {
> +		vref_value = divide_nint(vrefpc - rangepc, 65);
> +		vref_value |= vref_range;
> +	}
> +
> +	debug("rtt_wr:%d, rtt_park_00:%d, rtt_park_01:%d, dqx_ctl:%d, rtt_nom:%d, vref_value:%d (0x%x)\n",
> +	      rtt_wr, rtt_park_00, rtt_park_01, dqx_ctl, rtt_nom, vref_value,
> +	      vref_value);
> +
> +	return vref_value;
> +}
> +
> +/*
> + * Compute a VREF value for rank rankx of LMC if_num from the impedance
> + * settings currently programmed in MODEREG_PARAMS1, MODEREG_PARAMS2 and
> + * COMP_CTL2, translated to ohms through imp_values.
> + *
> + * NOTE: only call this for DIMMs with 1 or 2 ranks, not 4.
> + *
> + * Environment overrides:
> + *   ddr_adjust_computed_vref - enable/disable the final adjustment
> + *   ddr_set_computed_vref    - return this value without computing
> + *
> + * dimm_count == 1 uses compute_vref_1slot_2rank(), anything else uses
> + * compute_vref_2slot_2rank(). When adjustment is enabled, a fixed
> + * negative offset is applied for 2-rank x4 RDIMMs.
> + *
> + * Return: the computed (and possibly adjusted) VREF value.
> + */
> +static int compute_vref_val(struct ddr_priv *priv, int if_num, int rankx,
> +			    int dimm_count, int rank_count,
> +			    struct impedence_values *imp_values,
> +			    int is_stacked_die, int dram_connection)
> +{
> +	int computed_final_vref_value = 0;
> +	int enable_adjust = ENABLE_COMPUTED_VREF_ADJUSTMENT;
> +	const char *s;
> +	int rtt_wr, dqx_ctl, rtt_nom, index;
> +	union cvmx_lmcx_modereg_params1 lmc_modereg_params1;
> +	union cvmx_lmcx_modereg_params2 lmc_modereg_params2;
> +	union cvmx_lmcx_comp_ctl2 comp_ctl2;
> +	int rtt_park;
> +	int rtt_park_00;
> +	int rtt_park_01;
> +
> +	debug("N0.LMC%d.R%d: %s(...dram_connection = %d)\n",
> +	      if_num, rankx, __func__, dram_connection);
> +
> +	// allow some overrides...
> +	s = env_get("ddr_adjust_computed_vref");
> +	if (s) {
> +		enable_adjust = !!simple_strtoul(s, NULL, 0);
> +		if (!enable_adjust) {
> +			debug("N0.LMC%d.R%d: DISABLE adjustment of computed VREF\n",
> +			      if_num, rankx);
> +		}
> +	}
> +
> +	s = env_get("ddr_set_computed_vref");
> +	if (s) {
> +		int new_vref = simple_strtoul(s, NULL, 0);
> +
> +		debug("N0.LMC%d.R%d: OVERRIDE computed VREF to 0x%x (%d)\n",
> +		      if_num, rankx, new_vref, new_vref);
> +		return new_vref;	/* override wins: no registers are read */
> +	}
> +
> +	/*
> +	 * Calculate an alternative to the measured vref value
> +	 * but only for configurations we know how to...
> +	 */
> +	// We have code for 2-rank DIMMs in both 1-slot or 2-slot configs,
> +	// and can use the 2-rank 1-slot code for 1-rank DIMMs in 1-slot
> +	// configs, and can use the 2-rank 2-slot code for 1-rank DIMMs
> +	// in 2-slot configs.
> +
> +	lmc_modereg_params1.u64 =
> +	    lmc_rd(priv, CVMX_LMCX_MODEREG_PARAMS1(if_num));
> +	lmc_modereg_params2.u64 =
> +	    lmc_rd(priv, CVMX_LMCX_MODEREG_PARAMS2(if_num));
> +	comp_ctl2.u64 = lmc_rd(priv, CVMX_LMCX_COMP_CTL2(if_num));
> +	dqx_ctl = imp_values->dqx_strength[comp_ctl2.s.dqx_ctl];
> +
> +	/*
> +	 * WR always comes from the current rank.
> +	 * The low two index bits are read at bit (rankx * 12 + 5); except
> +	 * on CN78XX pass 1, register bit (51 + rankx) supplies index bit 2
> +	 * (the shift by "- 2" lands it on mask 0x04).
> +	 */
> +	index = (lmc_modereg_params1.u64 >> (rankx * 12 + 5)) & 0x03;
> +	if (!octeon_is_cpuid(OCTEON_CN78XX_PASS1_X))
> +		index |= lmc_modereg_params1.u64 >> (51 + rankx - 2) & 0x04;
> +	rtt_wr = imp_values->rtt_wr_ohms[index];
> +
> +	// separate calculations for 1 vs 2 DIMMs per LMC
> +	if (dimm_count == 1) {
> +		/*
> +		 * PARK comes from this rank if 1-rank, otherwise other rank:
> +		 * rankx ^ (rank_count - 1) leaves rankx unchanged for one
> +		 * rank and flips its low bit for two ranks. The 3-bit value
> +		 * is translated with the rtt_nom_ohms table.
> +		 */
> +		index =
> +		    (lmc_modereg_params2.u64 >>
> +		     ((rankx ^ (rank_count - 1)) * 10 + 0)) & 0x07;
> +		rtt_park = imp_values->rtt_nom_ohms[index];
> +		computed_final_vref_value =
> +		    compute_vref_1slot_2rank(rtt_wr, rtt_park, dqx_ctl,
> +					     rank_count, dram_connection);
> +	} else {
> +		// get both PARK values from the other DIMM
> +		index =
> +		    (lmc_modereg_params2.u64 >> ((rankx ^ 0x02) * 10 + 0)) &
> +		    0x07;
> +		rtt_park_00 = imp_values->rtt_nom_ohms[index];
> +		index =
> +		    (lmc_modereg_params2.u64 >> ((rankx ^ 0x03) * 10 + 0)) &
> +		    0x07;
> +		rtt_park_01 = imp_values->rtt_nom_ohms[index];
> +		// NOM comes from this rank if 1-rank, otherwise other rank
> +		index =
> +		    (lmc_modereg_params1.u64 >>
> +		     ((rankx ^ (rank_count - 1)) * 12 + 9)) & 0x07;
> +		rtt_nom = imp_values->rtt_nom_ohms[index];
> +		computed_final_vref_value =
> +		    compute_vref_2slot_2rank(rtt_wr, rtt_park_00, rtt_park_01,
> +					     dqx_ctl, rtt_nom, dram_connection);
> +	}
> +
> +	if (enable_adjust) {
> +		union cvmx_lmcx_config lmc_config;
> +		union cvmx_lmcx_control lmc_control;
> +
> +		lmc_config.u64 = lmc_rd(priv, CVMX_LMCX_CONFIG(if_num));
> +		lmc_control.u64 = lmc_rd(priv, CVMX_LMCX_CONTROL(if_num));
> +
> +		/*
> +		 *  New computed vref = existing computed vref - X
> +		 *
> +		 * The value of X depends on different conditions.
> +		 * Both #122 and #139 are 2Rx4 RDIMM, while #124 is stacked
> +		 * die 2Rx4, so the results are summarized as two conditions:
> +		 *
> +		 * 1. Stacked Die: 2Rx4
> +		 * 1-slot: offset = 7, i.e. new computed vref = existing
> +		 * computed vref - 7
> +		 * 2-slot: offset = 6
> +		 *
> +		 * 2. Regular: 2Rx4
> +		 * 1-slot: offset = 3
> +		 * 2-slot:  offset = 2
> +		 */
> +		// we know we never get called unless DDR4, so test just
> +		// the other conditions
> +		if (lmc_control.s.rdimm_ena == 1 &&
> +		    rank_count == 2 && lmc_config.s.mode_x4dev) {
> +			// it must first be RDIMM and 2-rank and x4
> +			int adj;
> +
> +			// now do according to stacked die or not...
> +			if (is_stacked_die)
> +				adj = (dimm_count == 1) ? -7 : -6;
> +			else
> +				adj = (dimm_count == 1) ? -3 : -2;
> +
> +			// the value is about to be adjusted, so log the old
> +			// and new values (debug builds only)
> +			debug("N0.LMC%d.R%d: adjusting computed vref from %2d (0x%02x) to %2d (0x%02x)\n",
> +			      if_num, rankx, computed_final_vref_value,
> +			      computed_final_vref_value,
> +			      computed_final_vref_value + adj,
> +			      computed_final_vref_value + adj);
> +			computed_final_vref_value += adj;
> +		}
> +	}
> +
> +	return computed_final_vref_value;
> +}
> +
> +/*
> + * Copy the per-byte delays of an RLEVEL_RANK register image into the
> + * rlevel_byte[] array.
> + *
> + * Bytes 0..3 always map one-to-one. For the upper bytes:
> + *  - all eight low mask bits set, ECC on:  register byte8 goes to index 4
> + *    and register bytes 4..7 go to indexes 5..8;
> + *  - all eight low mask bits set, ECC off: register bytes 4..7 go to
> + *    indexes 4..7 and index 8 is left untouched;
> + *  - otherwise: register bytes 4..8 go to indexes 4..8 unchanged.
> + */
> +static void unpack_rlevel_settings(int if_bytemask, int ecc_ena,
> +				   struct rlevel_byte_data *rlevel_byte,
> +				   union cvmx_lmcx_rlevel_rankx lmc_rlevel_rank)
> +{
> +	const int all_lanes = ((if_bytemask & 0xff) == 0xff);
> +
> +	/* The low four lanes are laid out identically in every mode. */
> +	rlevel_byte[0].delay = lmc_rlevel_rank.s.byte0;
> +	rlevel_byte[1].delay = lmc_rlevel_rank.s.byte1;
> +	rlevel_byte[2].delay = lmc_rlevel_rank.s.byte2;
> +	rlevel_byte[3].delay = lmc_rlevel_rank.s.byte3;
> +
> +	if (!all_lanes) {
> +		/* Straight copy; index 4 is ECC, 5..8 are unused. */
> +		rlevel_byte[4].delay = lmc_rlevel_rank.s.byte4;
> +		rlevel_byte[5].delay = lmc_rlevel_rank.s.byte5;
> +		rlevel_byte[6].delay = lmc_rlevel_rank.s.byte6;
> +		rlevel_byte[7].delay = lmc_rlevel_rank.s.byte7;
> +		rlevel_byte[8].delay = lmc_rlevel_rank.s.byte8;
> +		return;
> +	}
> +
> +	if (!ecc_ena) {
> +		/* No ECC lane: upper data lanes keep their positions. */
> +		rlevel_byte[4].delay = lmc_rlevel_rank.s.byte4;
> +		rlevel_byte[5].delay = lmc_rlevel_rank.s.byte5;
> +		rlevel_byte[6].delay = lmc_rlevel_rank.s.byte6;
> +		rlevel_byte[7].delay = lmc_rlevel_rank.s.byte7;
> +		return;
> +	}
> +
> +	/* ECC lane slots in at index 4, upper data lanes move up by one. */
> +	rlevel_byte[4].delay = lmc_rlevel_rank.s.byte8;
> +	rlevel_byte[5].delay = lmc_rlevel_rank.s.byte4;
> +	rlevel_byte[6].delay = lmc_rlevel_rank.s.byte5;
> +	rlevel_byte[7].delay = lmc_rlevel_rank.s.byte6;
> +	rlevel_byte[8].delay = lmc_rlevel_rank.s.byte7;
> +}
> +
> +/*
> + * Write the delays held in rlevel_byte[] back into an RLEVEL_RANK
> + * register image. This is the inverse mapping of
> + * unpack_rlevel_settings():
> + *  - all eight low mask bits set, ECC on:  index 4 goes to register byte8
> + *    and indexes 5..8 go to register bytes 4..7;
> + *  - all eight low mask bits set, ECC off: indexes 4..7 go to register
> + *    bytes 4..7 and register byte8 is left as it was;
> + *  - otherwise: indexes 4..8 go to register bytes 4..8 unchanged.
> + * Fields of *final_rlevel_rank that are not byte delays are preserved.
> + */
> +static void pack_rlevel_settings(int if_bytemask, int ecc_ena,
> +				 struct rlevel_byte_data *rlevel_byte,
> +				 union cvmx_lmcx_rlevel_rankx
> +				 *final_rlevel_rank)
> +{
> +	const int all_lanes = ((if_bytemask & 0xff) == 0xff);
> +	/* Work on a copy and store it back in one go at the end. */
> +	union cvmx_lmcx_rlevel_rankx packed = *final_rlevel_rank;
> +
> +	/* The low four lanes are laid out identically in every mode. */
> +	packed.s.byte0 = rlevel_byte[0].delay;
> +	packed.s.byte1 = rlevel_byte[1].delay;
> +	packed.s.byte2 = rlevel_byte[2].delay;
> +	packed.s.byte3 = rlevel_byte[3].delay;
> +
> +	if (all_lanes && ecc_ena) {
> +		/* ECC lives at index 4, upper data lanes sit one higher. */
> +		packed.s.byte8 = rlevel_byte[4].delay;
> +		packed.s.byte4 = rlevel_byte[5].delay;
> +		packed.s.byte5 = rlevel_byte[6].delay;
> +		packed.s.byte6 = rlevel_byte[7].delay;
> +		packed.s.byte7 = rlevel_byte[8].delay;
> +	} else {
> +		packed.s.byte4 = rlevel_byte[4].delay;
> +		packed.s.byte5 = rlevel_byte[5].delay;
> +		packed.s.byte6 = rlevel_byte[6].delay;
> +		packed.s.byte7 = rlevel_byte[7].delay;
> +		/* byte8 is only written when not all lanes are in use */
> +		if (!all_lanes)
> +			packed.s.byte8 = rlevel_byte[8].delay;
> +	}
> +
> +	*final_rlevel_rank = packed;
> +}
> +
> +/* These are the RLEVEL settings display routines */
> +
> +/* flags for do_display_rl() / do_display_wl(); may be OR'ed together */
> +#define WITH_NOTHING 0
> +#define WITH_SCORE   1
> +#define WITH_AVERAGE 2
> +#define WITH_FINAL   4
> +#define WITH_COMPUTE 8
> +
> +/*
> + * Print one line of read-leveling settings through debug().
> + *
> + * @if_num:		LMC interface number
> + * @lmc_rlevel_rank:	register image whose status and byte8..byte0 fields
> + *			are printed
> + * @rank:		rank number
> + * @flags:		WITH_* bits. WITH_SCORE appends "(score)"; the label
> + *			is chosen from WITH_AVERAGE, WITH_FINAL, WITH_COMPUTE
> + *			in that priority order, and falls back to the raw
> + *			register value in hex when none of them is set
> + * @score:		score to print when WITH_SCORE is set
> + */
> +static void do_display_rl(int if_num,
> +			  union cvmx_lmcx_rlevel_rankx lmc_rlevel_rank,
> +			  int rank, int flags, int score)
> +{
> +	char score_buf[16];
> +	/* points at string literals or at hex_buf, never written through */
> +	const char *msg_buf;
> +	char hex_buf[20];
> +
> +	if (flags & WITH_SCORE) {
> +		snprintf(score_buf, sizeof(score_buf), "(%d)", score);
> +	} else {
> +		score_buf[0] = ' ';
> +		score_buf[1] = 0;
> +	}
> +
> +	if (flags & WITH_AVERAGE) {
> +		msg_buf = "  DELAY AVERAGES  ";
> +	} else if (flags & WITH_FINAL) {
> +		msg_buf = "  FINAL SETTINGS  ";
> +	} else if (flags & WITH_COMPUTE) {
> +		msg_buf = "  COMPUTED DELAYS ";
> +	} else {
> +		snprintf(hex_buf, sizeof(hex_buf), "0x%016llX",
> +			 (unsigned long long)lmc_rlevel_rank.u64);
> +		msg_buf = hex_buf;
> +	}
> +
> +	debug("N0.LMC%d.R%d: Rlevel Rank %#4x, %s  : %5d %5d %5d %5d %5d %5d %5d %5d %5d %s\n",
> +	      if_num, rank, lmc_rlevel_rank.s.status, msg_buf,
> +	      lmc_rlevel_rank.s.byte8, lmc_rlevel_rank.s.byte7,
> +	      lmc_rlevel_rank.s.byte6, lmc_rlevel_rank.s.byte5,
> +	      lmc_rlevel_rank.s.byte4, lmc_rlevel_rank.s.byte3,
> +	      lmc_rlevel_rank.s.byte2, lmc_rlevel_rank.s.byte1,
> +	      lmc_rlevel_rank.s.byte0, score_buf);
> +}
> +
> +/* Print the read-leveling settings with the raw register value as label. */
> +static void display_rl(int if_num,
> +		       union cvmx_lmcx_rlevel_rankx lmc_rlevel_rank, int rank)
> +{
> +	do_display_rl(if_num, lmc_rlevel_rank, rank, WITH_NOTHING, 0);
> +}
> +
> +/* Print the read-leveling settings (raw register label) plus a score. */
> +static void display_rl_with_score(int if_num,
> +				  union cvmx_lmcx_rlevel_rankx lmc_rlevel_rank,
> +				  int rank, int score)
> +{
> +	do_display_rl(if_num, lmc_rlevel_rank, rank, WITH_SCORE, score);
> +}
> +
> +/* Print the read-leveling settings labelled as the final settings. */
> +static void display_rl_with_final(int if_num,
> +				  union cvmx_lmcx_rlevel_rankx lmc_rlevel_rank,
> +				  int rank)
> +{
> +	do_display_rl(if_num, lmc_rlevel_rank, rank, WITH_FINAL, 0);
> +}
> +
> +/* Print the read-leveling settings labelled as computed delays, with score. */
> +static void display_rl_with_computed(int if_num,
> +				     union cvmx_lmcx_rlevel_rankx
> +				     lmc_rlevel_rank, int rank, int score)
> +{
> +	do_display_rl(if_num, lmc_rlevel_rank, rank,
> +		      WITH_SCORE | WITH_COMPUTE, score);
> +}
> +
> +// flag values
> +#define WITH_RODT_BLANK      0
> +#define WITH_RODT_SKIPPING   1
> +#define WITH_RODT_BESTROW    2
> +#define WITH_RODT_BESTSCORE  3
> +// control
> +#define SKIP_SKIPPING 1
> +
> +/* Labels indexed by the WITH_RODT_* flag values above. */
> +static const char *with_rodt_canned_msgs[4] = {
> +	"          ", "SKIPPING  ", "BEST ROW  ", "BEST SCORE"
> +};
> +
> +/*
> + * Print one line of read-leveling settings together with the termination
> + * values they were obtained with.
> + *
> + * @if_num:		LMC interface number
> + * @lmc_rlevel_rank:	register image whose byte8..byte0 fields are printed
> + * @rank:		rank number
> + * @score:		score printed in parentheses at the end of the line
> + * @nom_ohms:		RTT_NOM in ohms; a negative value omits it
> + * @rodt_ohms:		RODT in ohms
> + * @flag:		one of the WITH_RODT_* values selecting the label.
> + *			WITH_RODT_SKIPPING lines are suppressed while
> + *			SKIP_SKIPPING is non-zero; values outside the label
> + *			table fall back to the blank label
> + */
> +static void display_rl_with_rodt(int if_num,
> +				 union cvmx_lmcx_rlevel_rankx lmc_rlevel_rank,
> +				 int rank, int score,
> +				 int nom_ohms, int rodt_ohms, int flag)
> +{
> +	const char *msg_buf;
> +	char set_buf[20];
> +
> +#if SKIP_SKIPPING
> +	if (flag == WITH_RODT_SKIPPING)
> +		return;
> +#endif
> +
> +	/* never index the label table with an unknown flag value */
> +	if (flag < 0 ||
> +	    flag >= (int)(sizeof(with_rodt_canned_msgs) /
> +			  sizeof(with_rodt_canned_msgs[0])))
> +		flag = WITH_RODT_BLANK;
> +
> +	msg_buf = with_rodt_canned_msgs[flag];
> +	if (nom_ohms < 0) {
> +		snprintf(set_buf, sizeof(set_buf), "    RODT %3d    ",
> +			 rodt_ohms);
> +	} else {
> +		snprintf(set_buf, sizeof(set_buf), "NOM %3d RODT %3d", nom_ohms,
> +			 rodt_ohms);
> +	}
> +
> +	debug("N0.LMC%d.R%d: Rlevel %s   %s  : %5d %5d %5d %5d %5d %5d %5d %5d %5d (%d)\n",
> +	      if_num, rank, set_buf, msg_buf, lmc_rlevel_rank.s.byte8,
> +	      lmc_rlevel_rank.s.byte7, lmc_rlevel_rank.s.byte6,
> +	      lmc_rlevel_rank.s.byte5, lmc_rlevel_rank.s.byte4,
> +	      lmc_rlevel_rank.s.byte3, lmc_rlevel_rank.s.byte2,
> +	      lmc_rlevel_rank.s.byte1, lmc_rlevel_rank.s.byte0, score);
> +}
> +
> +/*
> + * Print one line of write-leveling settings through debug().
> + *
> + * @if_num:		LMC interface number
> + * @lmc_wlevel_rank:	register image whose status and byte8..byte0 fields
> + *			are printed
> + * @rank:		rank number
> + * @flags:		WITH_FINAL labels the line as final settings; without
> + *			it the raw register value is printed in hex instead
> + */
> +static void do_display_wl(int if_num,
> +			  union cvmx_lmcx_wlevel_rankx lmc_wlevel_rank,
> +			  int rank, int flags)
> +{
> +	/* points at a string literal or at hex_buf, never written through */
> +	const char *msg_buf;
> +	char hex_buf[20];
> +
> +	if (flags & WITH_FINAL) {
> +		msg_buf = "  FINAL SETTINGS  ";
> +	} else {
> +		snprintf(hex_buf, sizeof(hex_buf), "0x%016llX",
> +			 (unsigned long long)lmc_wlevel_rank.u64);
> +		msg_buf = hex_buf;
> +	}
> +
> +	debug("N0.LMC%d.R%d: Wlevel Rank %#4x, %s  : %5d %5d %5d %5d %5d %5d %5d %5d %5d\n",
> +	      if_num, rank, lmc_wlevel_rank.s.status, msg_buf,
> +	      lmc_wlevel_rank.s.byte8, lmc_wlevel_rank.s.byte7,
> +	      lmc_wlevel_rank.s.byte6, lmc_wlevel_rank.s.byte5,
> +	      lmc_wlevel_rank.s.byte4, lmc_wlevel_rank.s.byte3,
> +	      lmc_wlevel_rank.s.byte2, lmc_wlevel_rank.s.byte1,
> +	      lmc_wlevel_rank.s.byte0);
> +}
> +
> +/* Print the write-leveling settings with the raw register value as label. */
> +static void display_wl(int if_num,
> +		       union cvmx_lmcx_wlevel_rankx lmc_wlevel_rank, int rank)
> +{
> +	const int label_flags = WITH_NOTHING;
> +
> +	do_display_wl(if_num, lmc_wlevel_rank, rank, label_flags);
> +}
> +
> +/* Print the write-leveling settings labelled as the final settings. */
> +static void display_wl_with_final(int if_num,
> +				  union cvmx_lmcx_wlevel_rankx lmc_wlevel_rank,
> +				  int rank)
> +{
> +	const int label_flags = WITH_FINAL;
> +
> +	do_display_wl(if_num, lmc_wlevel_rank, rank, label_flags);
> +}
> +
> +/*
> + * Pretty-print bitmask adjuster: shift a non-zero bitmask right one
> + * nibble at a time until its low byte is non-zero. Zero is returned
> + * unchanged.
> + */
> +static u64 ppbm(u64 bm)
> +{
> +	/* nothing to align, and the loop below would never terminate */
> +	if (bm == 0ul)
> +		return bm;
> +
> +	for (; (bm & 0x0fful) == 0ul; bm >>= 4)
> +		;
> +
> +	return bm;
> +}
> +
> +// xlate PACKED index to UNPACKED index to use with rlevel_byte
> +#define XPU(i, e) (((i) < 4) ? (i) : (((i) < 8) ? (i) + (e) : 4))
> +// xlate UNPACKED index to PACKED index to use with rlevel_bitmask
> +#define XUP(i, e) (((i) < 4) ? (i) : (e) ? (((i) > 4) ? (i) - 1 : 8) : (i))
> +
> +// flag values
> +#define WITH_WL_BITMASKS      0
> +#define WITH_RL_BITMASKS      1
> +#define WITH_RL_MASK_SCORES   2
> +#define WITH_RL_SEQ_SCORES    3
> +
> +/*
> + * Print one line of per-byte leveling debug data, entries 8 down to 0.
> + *
> + * @if_num:	LMC interface number
> + * @rank:	rank number
> + * @bm:		data to print; its real type is selected by @flags:
> + *		WITH_WL_BITMASKS    - int[9]
> + *		WITH_RL_BITMASKS    - struct rlevel_bitmask[9], .bm printed
> + *		WITH_RL_MASK_SCORES - struct rlevel_bitmask[9], .errs printed
> + *		WITH_RL_SEQ_SCORES  - struct rlevel_byte_data[], .sqerrs
> + *				      printed, indexes translated with XPU()
> + * @flags:	one of the WITH_*_BITMASKS / WITH_RL_*_SCORES values; any
> + *		other value prints nothing
> + * @ecc:	passed to XPU() for the WITH_RL_SEQ_SCORES translation
> + */
> +static void do_display_bm(int if_num, int rank, void *bm,
> +			  int flags, int ecc)
> +{
> +	switch (flags) {
> +	case WITH_WL_BITMASKS: {
> +		/* already in PACKED index order, print as is */
> +		int *wl_masks = bm;
> +
> +		debug("N0.LMC%d.R%d: Wlevel Debug Bitmasks                 : %05x %05x %05x %05x %05x %05x %05x %05x %05x\n",
> +		      if_num, rank, wl_masks[8], wl_masks[7], wl_masks[6],
> +		      wl_masks[5], wl_masks[4], wl_masks[3], wl_masks[2],
> +		      wl_masks[1], wl_masks[0]);
> +		break;
> +	}
> +	case WITH_RL_BITMASKS: {
> +		/* already in PACKED index order, print as is */
> +		struct rlevel_bitmask *rl_masks = bm;
> +
> +		debug("N0.LMC%d.R%d: Rlevel Debug Bitmasks        8:0      : %05llx %05llx %05llx %05llx %05llx %05llx %05llx %05llx %05llx\n",
> +		      if_num, rank, ppbm(rl_masks[8].bm),
> +		      ppbm(rl_masks[7].bm), ppbm(rl_masks[6].bm),
> +		      ppbm(rl_masks[5].bm), ppbm(rl_masks[4].bm),
> +		      ppbm(rl_masks[3].bm), ppbm(rl_masks[2].bm),
> +		      ppbm(rl_masks[1].bm), ppbm(rl_masks[0].bm));
> +		break;
> +	}
> +	case WITH_RL_MASK_SCORES: {
> +		/* already in PACKED index order, print as is */
> +		struct rlevel_bitmask *rl_masks = bm;
> +
> +		debug("N0.LMC%d.R%d: Rlevel Debug Bitmask Scores  8:0      : %5d %5d %5d %5d %5d %5d %5d %5d %5d\n",
> +		      if_num, rank, rl_masks[8].errs,
> +		      rl_masks[7].errs, rl_masks[6].errs,
> +		      rl_masks[5].errs, rl_masks[4].errs,
> +		      rl_masks[3].errs, rl_masks[2].errs,
> +		      rl_masks[1].errs, rl_masks[0].errs);
> +		break;
> +	}
> +	case WITH_RL_SEQ_SCORES: {
> +		/* UNPACKED index order: translate each index, then print */
> +		struct rlevel_byte_data *lanes = bm;
> +
> +		debug("N0.LMC%d.R%d: Rlevel Debug Non-seq Scores  8:0      : %5d %5d %5d %5d %5d %5d %5d %5d %5d\n",
> +		      if_num, rank, lanes[XPU(8, ecc)].sqerrs,
> +		      lanes[XPU(7, ecc)].sqerrs,
> +		      lanes[XPU(6, ecc)].sqerrs,
> +		      lanes[XPU(5, ecc)].sqerrs,
> +		      lanes[XPU(4, ecc)].sqerrs,
> +		      lanes[XPU(3, ecc)].sqerrs,
> +		      lanes[XPU(2, ecc)].sqerrs,
> +		      lanes[XPU(1, ecc)].sqerrs,
> +		      lanes[XPU(0, ecc)].sqerrs);
> +		break;
> +	}
> +	default:
> +		/* unknown selector: print nothing */
> +		break;
> +	}
> +}
> +
> +/* Print the write-leveling debug bitmasks (array of int). */
> +static void display_wl_bm(int if_num, int rank, int *bitmasks)
> +{
> +	void *data = bitmasks;
> +
> +	do_display_bm(if_num, rank, data, WITH_WL_BITMASKS, 0);
> +}
> +
> +/* Print the .bm member of each read-leveling bitmask entry. */
> +static void display_rl_bm(int if_num, int rank,
> +			  struct rlevel_bitmask *bitmasks, int ecc_ena)
> +{
> +	void *data = bitmasks;
> +
> +	do_display_bm(if_num, rank, data, WITH_RL_BITMASKS, ecc_ena);
> +}
> +
> +/* Print the .errs member of each read-leveling bitmask entry. */
> +static void display_rl_bm_scores(int if_num, int rank,
> +				 struct rlevel_bitmask *bitmasks, int ecc_ena)
> +{
> +	void *data = bitmasks;
> +
> +	do_display_bm(if_num, rank, data, WITH_RL_MASK_SCORES, ecc_ena);
> +}
> +
> +/* Print the .sqerrs member of each read-leveling byte entry. */
> +static void display_rl_seq_scores(int if_num, int rank,
> +				  struct rlevel_byte_data *bytes, int ecc_ena)
> +{
> +	void *data = bytes;
> +
> +	do_display_bm(if_num, rank, data, WITH_RL_SEQ_SCORES, ecc_ena);
> +}
> +
> +#define RODT_OHMS_COUNT        8	/* entries in the *_rodt_ohms tables */
> +#define RTT_NOM_OHMS_COUNT     8	/* entries in the *_rtt_nom_ohms tables */
> +#define RTT_NOM_TABLE_COUNT    8	/* entries in the *_rtt_nom_table tables */
> +#define RTT_WR_OHMS_COUNT      8	/* entries in the *_rtt_wr_ohms tables */
> +#define DIC_OHMS_COUNT         3	/* entries in the *_dic_ohms tables */
> +#define DRIVE_STRENGTH_COUNT  15	/* entries in the drive/dqx strength tables */
> +
> +static unsigned char ddr4_rodt_ohms[RODT_OHMS_COUNT] = {
> +	0, 40, 60, 80, 120, 240, 34, 48 };
> +static unsigned char ddr4_rtt_nom_ohms[RTT_NOM_OHMS_COUNT] = {
> +	0, 60, 120, 40, 240, 48, 80, 34 };
> +static unsigned char ddr4_rtt_nom_table[RTT_NOM_TABLE_COUNT] = {
> +	0, 4, 2, 6, 1, 5, 3, 7 };
> +/*
> + * Setting HiZ ohms to 99 for computed vref: the compute_vref_*() helpers
> + * treat an rtt_wr of 99 like 0. Only five of the eight entries are given;
> + * the remaining ones are zero-initialized.
> + */
> +static unsigned char ddr4_rtt_wr_ohms[RTT_WR_OHMS_COUNT] = {
> +	0, 120, 240, 99, 80 };
> +static unsigned char ddr4_dic_ohms[DIC_OHMS_COUNT] = { 34, 48 };	/* third entry is zero */
> +static short ddr4_drive_strength[DRIVE_STRENGTH_COUNT] = {
> +	0, 0, 26, 30, 34, 40, 48, 68, 0, 0, 0, 0, 0, 0, 0 };
> +static short ddr4_dqx_strength[DRIVE_STRENGTH_COUNT] = {
> +	0, 24, 27, 30, 34, 40, 48, 60, 0, 0, 0, 0, 0, 0, 0 };
> +struct impedence_values ddr4_impedence_val = {	/* NOTE(review): not static, unlike the DDR3 set - presumably referenced from another file; confirm */
> +	.rodt_ohms = ddr4_rodt_ohms,
> +	.rtt_nom_ohms = ddr4_rtt_nom_ohms,
> +	.rtt_nom_table = ddr4_rtt_nom_table,
> +	.rtt_wr_ohms = ddr4_rtt_wr_ohms,
> +	.dic_ohms = ddr4_dic_ohms,
> +	.drive_strength = ddr4_drive_strength,
> +	.dqx_strength = ddr4_dqx_strength,
> +};
> +
> +static unsigned char ddr3_rodt_ohms[RODT_OHMS_COUNT] = {	/* DDR3 counterparts of the DDR4 tables */
> +	0, 20, 30, 40, 60, 120, 0, 0 };
> +static unsigned char ddr3_rtt_nom_ohms[RTT_NOM_OHMS_COUNT] = {
> +	0, 60, 120, 40, 20, 30, 0, 0 };
> +static unsigned char ddr3_rtt_nom_table[RTT_NOM_TABLE_COUNT] = {
> +	0, 2, 1, 3, 5, 4, 0, 0 };
> +static unsigned char ddr3_rtt_wr_ohms[RTT_WR_OHMS_COUNT] = { 0, 60, 120 };	/* remaining entries are zero */
> +static unsigned char ddr3_dic_ohms[DIC_OHMS_COUNT] = { 40, 34 };	/* third entry is zero */
> +static short ddr3_drive_strength[DRIVE_STRENGTH_COUNT] = {
> +	0, 24, 27, 30, 34, 40, 48, 60, 0, 0, 0, 0, 0, 0, 0 };
> +static struct impedence_values ddr3_impedence_val = {
> +	.rodt_ohms = ddr3_rodt_ohms,
> +	.rtt_nom_ohms = ddr3_rtt_nom_ohms,
> +	.rtt_nom_table = ddr3_rtt_nom_table,
> +	.rtt_wr_ohms = ddr3_rtt_wr_ohms,
> +	.dic_ohms = ddr3_dic_ohms,
> +	.drive_strength = ddr3_drive_strength,
> +	.dqx_strength = ddr3_drive_strength,	/* no separate DQX table for DDR3: the drive strength table is reused */
> +};
> +
> +/* Convert a frequency in Hz to its period in picoseconds, rounded. */
> +static u64 hertz_to_psecs(u64 hertz)
> +{
> +	const u64 psecs_per_sec = (u64)1000 * 1000 * 1000 * 1000;
> +
> +	return divide_nint(psecs_per_sec, hertz);
> +}
> +
> +#define DIVIDEND_SCALE 1000	/* Scale to avoid rounding error. */
> +
> +/*
> + * Convert a clock period in picoseconds to a data rate in MT/s (two
> + * transfers per clock). The division is done on a value scaled up by
> + * DIVIDEND_SCALE and scaled back afterwards, rounding at both steps.
> + */
> +static u64 psecs_to_mts(u64 psecs)
> +{
> +	u64 scaled_mts;
> +
> +	scaled_mts = divide_nint((u64)(2 * 1000000 * DIVIDEND_SCALE), psecs);
> +
> +	return divide_nint(scaled_mts, DIVIDEND_SCALE);
> +}
> +
> +#define WITHIN(v, b, m) (((v) >= ((b) - (m))) && ((v) <= ((b) + (m))))
> +
> +/*
> + * Map a clock period in picoseconds to the nominal data rate in MT/s.
> + * A period matches a table entry when it is within +/-1 ps of it.
> + *
> + * Return: the nominal data rate, or 0 when no entry matches.
> + */
> +static unsigned long pretty_psecs_to_mts(u64 psecs)
> +{
> +	static const struct {
> +		int period;	/* nominal clock period in ps */
> +		int rate;	/* data rate reported for it */
> +	} grades[] = {
> +		{ 2500,  800 },
> +		{ 1875, 1066 },
> +		{ 1500, 1333 },
> +		{ 1250, 1600 },
> +		{ 1071, 1866 },
> +		{  937, 2133 },
> +		{  833, 2400 },
> +		{  750, 2666 },
> +	};
> +	unsigned int idx;
> +
> +	for (idx = 0; idx < sizeof(grades) / sizeof(grades[0]); idx++) {
> +		if (WITHIN(psecs, grades[idx].period, 1))
> +			return grades[idx].rate;
> +	}
> +
> +	return 0;	// default to error
> +}
> +
> +/* Convert a data rate in MT/s to a clock in Hz (two transfers per clock). */
> +static u64 mts_to_hertz(u64 mts)
> +{
> +	const u64 transfers_per_sec = mts * 1000 * 1000;
> +
> +	return transfers_per_sec / 2;
> +}
> +/*
> + * Derive the "rc3x" encoding from the DDR clock period in picoseconds.
> + *
> + * The period is converted to a data rate with psecs_to_mts(); the result
> + * is the index of the first 20-wide step, counted from ENCODING_BASE,
> + * whose upper bound (speed + 20) is not below that data rate. If no step
> + * matches, the index of the last step tried is returned.
> + *
> + * NOTE(review): presumably this is the RC3x control word of a DDR4
> + * register - confirm against the caller.
> + */
> +static int compute_rc3x(int64_t tclk_psecs)
> +{
> +	long speed;
> +	long tclk_psecs_min, tclk_psecs_max;
> +	long data_rate_mhz, data_rate_mhz_min, data_rate_mhz_max;
> +	int rc3x;
> +
> +#define ENCODING_BASE 1240
> +
> +	data_rate_mhz = psecs_to_mts(tclk_psecs);
> +
> +	/*
> +	 * 2400 MT/s is a special case. Using integer arithmetic it rounds
> +	 * from 833 psecs to 2401 MT/s. Force it to 2400 to pick the
> +	 * proper setting from the table.
> +	 */
> +	if (tclk_psecs == 833)
> +		data_rate_mhz = 2400;
> +
> +	for (speed = ENCODING_BASE; speed < 3200; speed += 20) {
> +		int error = 0;	/* NOTE(review): accumulated below but never read */
> +
> +		/* Clock in psecs */
> +		tclk_psecs_min = hertz_to_psecs(mts_to_hertz(speed + 00));
> +		/* Clock in psecs */
> +		tclk_psecs_max = hertz_to_psecs(mts_to_hertz(speed + 18));
> +
> +		data_rate_mhz_min = psecs_to_mts(tclk_psecs_min);
> +		data_rate_mhz_max = psecs_to_mts(tclk_psecs_max);
> +
> +		/* Force alignment to a multiple of 20 to avoid rounding errors. */
> +		data_rate_mhz_min = ((data_rate_mhz_min + 18) / 20) * 20;
> +		data_rate_mhz_max = ((data_rate_mhz_max + 18) / 20) * 20;
> +
> +		error += (speed + 00 != data_rate_mhz_min);
> +		error += (speed + 20 != data_rate_mhz_max);
> +
> +		rc3x = (speed - ENCODING_BASE) / 20;	/* index of the current step */
> +
> +		if (data_rate_mhz <= (speed + 20))	/* first step covering the rate wins */
> +			break;
> +	}
> +
> +	return rc3x;
> +}
> +
> +/*
> + * static global variables needed, so that functions (loops) can be
> + * restructured from the main huge function. It's not elegant, but the
> + * only way to break the original functions like init_octeon3_ddr3_interface()
> + * into separate logical smaller functions with less indentation levels.
> + */
> +static int if_num __section(".data");
> +static u32 if_mask __section(".data");
> +static int ddr_hertz __section(".data");
> +
> +static struct ddr_conf *ddr_conf __section(".data");
> +static const struct dimm_odt_config *odt_1rank_config __section(".data");
> +/*
> + * File-scope working state shared by the lmc_*() register setup helpers
> + * below. Every object is explicitly placed in the ".data" section.
> + * NOTE(review): presumably because .bss cannot be relied upon at the time
> + * this code runs - confirm.
> + */
> +static const struct dimm_odt_config *odt_2rank_config __section(".data");
> +static const struct dimm_odt_config *odt_4rank_config __section(".data");
> +static struct dimm_config *dimm_config_table __section(".data");
> +/* Table indexed by odt_idx; source of the ODT/MODEREG register defaults */
> +static const struct dimm_odt_config *odt_config __section(".data");
> +/* Board specific defaults (ddr2t, fprch2, ck/cmd/ctl drive strength, ...) */
> +static const struct ddr3_custom_config *c_cfg __section(".data");
> +
> +static int odt_idx __section(".data");
> +
> +/* Clock periods in picoseconds; tclk_psecs divides all DRAM timings below */
> +static ulong tclk_psecs __section(".data");
> +static ulong eclk_psecs __section(".data");
> +
> +static int row_bits __section(".data");
> +static int col_bits __section(".data");
> +static int num_banks __section(".data");
> +static int num_ranks __section(".data");
> +static int dram_width __section(".data");
> +static int dimm_count __section(".data");
> +/* Accumulate and report all the errors before giving up */
> +static int fatal_error __section(".data");
> +/* Flag that indicates safe DDR settings should be used */
> +static int safe_ddr_flag __section(".data");
> +/* Octeon II Default: 64bit interface width */
> +static int if_64b __section(".data");
> +static int if_bytemask __section(".data");
> +static u32 mem_size_mbytes __section(".data");
> +static unsigned int didx __section(".data");
> +static int bank_bits __section(".data");
> +static int bunk_enable __section(".data");
> +/* One bit per populated rank; tested as bits 0..3 by lmc_nxm() */
> +static int rank_mask __section(".data");
> +static int column_bits_start __section(".data");
> +static int row_lsb __section(".data");
> +static int pbank_lsb __section(".data");
> +static int use_ecc __section(".data");
> +static int mtb_psec __section(".data");
> +static short ftb_dividend __section(".data");
> +static short ftb_divisor __section(".data");
> +static int taamin __section(".data");
> +static int tckmin __section(".data");
> +/* CAS latency; may be overridden by "ddr_cl" in lmc_modereg_params0() */
> +static int cl __section(".data");
> +static int min_cas_latency __section(".data");
> +static int max_cas_latency __section(".data");
> +static int override_cas_latency __section(".data");
> +static int ddr_rtt_nom_auto __section(".data");
> +static int ddr_rodt_ctl_auto __section(".data");
> +
> +/*
> + * spd_* state. NOTE(review): the names suggest these hold fields decoded
> + * from the DIMM SPD EEPROM; they are assigned outside this excerpt, so
> + * confirm against the SPD parsing code.
> + */
> +static int spd_addr __section(".data");
> +static int spd_org __section(".data");
> +static int spd_banks __section(".data");
> +static int spd_rdimm __section(".data");
> +static int spd_dimm_type __section(".data");
> +static int spd_ecc __section(".data");
> +static u32 spd_cas_latency __section(".data");
> +static int spd_mtb_dividend __section(".data");
> +static int spd_mtb_divisor __section(".data");
> +static int spd_tck_min __section(".data");
> +static int spd_taa_min __section(".data");
> +static int spd_twr __section(".data");
> +static int spd_trcd __section(".data");
> +static int spd_trrd __section(".data");
> +static int spd_trp __section(".data");
> +static int spd_tras __section(".data");
> +static int spd_trc __section(".data");
> +static int spd_trfc __section(".data");
> +static int spd_twtr __section(".data");
> +static int spd_trtp __section(".data");
> +static int spd_tfaw __section(".data");
> +static int spd_addr_mirror __section(".data");
> +static int spd_package __section(".data");
> +static int spd_rawcard __section(".data");
> +static int spd_rawcard_aorb __section(".data");
> +static int spd_rdimm_registers __section(".data");
> +static int spd_thermal_sensor __section(".data");
> +
> +static int is_stacked_die __section(".data");
> +static int is_3ds_dimm __section(".data");
> +/* 3DS: logical ranks per package rank */
> +static int lranks_per_prank __section(".data");
> +/* 3DS: logical ranks bits */
> +static int lranks_bits __section(".data");
> +/* in Mbits; only used for 3DS */
> +static int die_capacity __section(".data");
> +
> +/* Compared against DDR4_DRAM to select DDR4 vs. DDR3 code paths */
> +static enum ddr_type ddr_type __section(".data");
> +
> +/*
> + * DRAM timing parameters. The lmc_timing_params*() helpers divide these
> + * by tclk_psecs, so they are expected to be in picoseconds.
> + */
> +static int twr __section(".data");
> +static int trcd __section(".data");
> +static int trrd __section(".data");
> +static int trp __section(".data");
> +static int tras __section(".data");
> +static int trc __section(".data");
> +static int trfc __section(".data");
> +static int twtr __section(".data");
> +static int trtp __section(".data");
> +static int tfaw __section(".data");
> +
> +/* DDR4 specific timing parameters, same unit as the block above */
> +static int ddr4_tckavgmin __section(".data");
> +static int ddr4_tckavgmax __section(".data");
> +static int ddr4_trdcmin __section(".data");
> +static int ddr4_trpmin __section(".data");
> +static int ddr4_trasmin __section(".data");
> +static int ddr4_trcmin __section(".data");
> +static int ddr4_trfc1min __section(".data");
> +static int ddr4_trfc2min __section(".data");
> +static int ddr4_trfc4min __section(".data");
> +static int ddr4_tfawmin __section(".data");
> +static int ddr4_trrd_smin __section(".data");
> +static int ddr4_trrd_lmin __section(".data");
> +static int ddr4_tccd_lmin __section(".data");
> +
> +static int wl_mask_err __section(".data");
> +static int wl_loops __section(".data");
> +/* RTT_NOM per rank as saved by lmc_modereg_params1() before any sweep */
> +static int default_rtt_nom[4] __section(".data");
> +/* Ranks whose RTT_NOM is swept; recomputed by lmc_rodt_mask() */
> +static int dyn_rtt_nom_mask __section(".data");
> +/* Lookup tables (rtt_nom_ohms, rtt_wr_ohms, dic_ohms) used for debug output */
> +static struct impedence_values *imp_val __section(".data");
> +static char default_rodt_ctl __section(".data");
> +/* default to disabled (ie, try LMC restart, not chip reset) */
> +static int ddr_disable_chip_reset __section(".data");
> +static const char *dimm_type_name __section(".data");
> +static int match_wl_rtt_nom __section(".data");
> +
> +#if SWL_TRY_HWL_ALT
> +/*
> + * Per-rank record of alternate HWL delays; only built when
> + * SWL_TRY_HWL_ALT is non-zero. hwl_alts[] holds one entry per rank.
> + */
> +struct hwl_alt_by_rank {
> +	u16 hwl_alt_mask;	/* mask of bytelanes with alternate */
> +	u16 hwl_alt_delay[9];	/* bytelane alternate avail if mask=1 */
> +};
> +
> +static struct hwl_alt_by_rank hwl_alts[4] __section(".data");
> +#endif /* SWL_TRY_HWL_ALT */
> +
> +/* Retry limit for internal VREF training */
> +#define DEFAULT_INTERNAL_VREF_TRAINING_LIMIT 3	/* was: 5 */
> +static int internal_retries __section(".data");
> +
> +/*
> + * Deskew training and DAC evaluation state. These are only declared here;
> + * the code that uses them lies outside this excerpt.
> + */
> +static int deskew_training_errors __section(".data");
> +static struct deskew_counts deskew_training_results __section(".data");
> +static int disable_deskew_training __section(".data");
> +static int restart_if_dsk_incomplete __section(".data");
> +static int dac_eval_retries __section(".data");
> +static int dac_settings[9] __section(".data");
> +static int num_samples __section(".data");
> +static int sample __section(".data");
> +static int lane __section(".data");
> +static int last_lane __section(".data");
> +static int total_dac_eval_retries __section(".data");
> +static int dac_eval_exhausted __section(".data");
> +
> +#define DEFAULT_DAC_SAMPLES 7	/* originally was 5 */
> +#define DAC_RETRIES_LIMIT   2
> +
> +/* DEFAULT_DAC_SAMPLES samples for one byte lane; lanes[] has 9 byte lanes */
> +struct bytelane_sample {
> +	s16 bytes[DEFAULT_DAC_SAMPLES];
> +};
> +
> +static struct bytelane_sample lanes[9] __section(".data");
> +
> +static char disable_sequential_delay_check __section(".data");
> +static int wl_print __section(".data");
> +
> +/* State for per-rank initialization; only built if ALLOW_BY_RANK_INIT */
> +#if ALLOW_BY_RANK_INIT
> +static int enable_by_rank_init __section(".data");
> +static int saved_rank_mask __section(".data");
> +static int by_rank __section(".data");
> +static struct deskew_data rank_dsk[4] __section(".data");
> +static struct dac_data rank_dac[4] __section(".data");
> +#endif /* ALLOW_BY_RANK_INIT */
> +
> +/* todo: perhaps remove node at some time completely? */
> +static int node __section(".data");
> +static int base_cl __section(".data");
> +
> +/*
> + * Parameters from DDR3 Specifications
> + *
> + * Time values are in picoseconds, as they are divided by tclk_psecs where
> + * used. DDR3_ZQCS_INTERNAL is additionally pre-divided by 100 and
> + * lmc_config() divides it by (tclk_psecs / 100) to match. Values marked
> + * nCK are clock cycle counts.
> + */
> +#define DDR3_TREFI         7800000	/* 7.8 us */
> +#define DDR3_ZQCS          80000ull	/* 80 ns */
> +#define DDR3_ZQCS_INTERNAL 1280000000ull	/* 128ms/100 */
> +#define DDR3_TCKE          5000	/* 5 ns */
> +#define DDR3_TMRD          4	/* 4 nCK */
> +#define DDR3_TDLLK         512	/* 512 nCK */
> +#define DDR3_TMPRR         1	/* 1 nCK */
> +#define DDR3_TWLMRD        40	/* 40 nCK */
> +#define DDR3_TWLDQSEN      25	/* 25 nCK */
> +
> +/* Parameters from DDR4 Specifications */
> +#define DDR4_TMRD          8	/* 8 nCK */
> +#define DDR4_TDLLK         768	/* 768 nCK */
> +
> +/*
> + * Build the LMC CONFIG register value from the file-scope DIMM/rank state
> + * and write it to CVMX_LMCX_CONFIG(if_num).
> + *
> + * The value is assembled from scratch (cfg.u64 starts at 0). The
> + * environment variables "ddr_idlepower", "ddr_ecc_adr" and "ddr_early_dqx"
> + * (or "ddr%d_early_dqx" for this interface) override single fields;
> + * "ddr_config", if set, replaces the complete 64-bit value.
> + *
> + * @priv: driver private data, handed to lookup_env*() and lmc_wr()
> + */
> +static void lmc_config(struct ddr_priv *priv)
> +{
> +	union cvmx_lmcx_config cfg;
> +	char *s;
> +
> +	cfg.u64 = 0;
> +
> +	cfg.cn78xx.ecc_ena = use_ecc;
> +	cfg.cn78xx.row_lsb = encode_row_lsb_ddr3(row_lsb);
> +	cfg.cn78xx.pbank_lsb = encode_pbank_lsb_ddr3(pbank_lsb);
> +
> +	cfg.cn78xx.idlepower = 0;	/* Disabled */
> +
> +	s = lookup_env(priv, "ddr_idlepower");
> +	if (s)
> +		cfg.cn78xx.idlepower = simple_strtoul(s, NULL, 0);
> +
> +	cfg.cn78xx.forcewrite = 0;	/* Disabled */
> +	/* Include memory reference address in the ECC */
> +	cfg.cn78xx.ecc_adr = 1;
> +
> +	s = lookup_env(priv, "ddr_ecc_adr");
> +	if (s)
> +		cfg.cn78xx.ecc_adr = simple_strtoul(s, NULL, 0);
> +
> +	cfg.cn78xx.reset = 0;
> +
> +	/*
> +	 * Program LMC0_CONFIG[24:18], ref_zqcs_int(6:0) to
> +	 * RND-DN(tREFI/clkPeriod/512).
> +	 * Program LMC0_CONFIG[36:25], ref_zqcs_int(18:7) to
> +	 * RND-DN(ZQCS_Interval/clkPeriod/(512*128)). Note that this
> +	 * value should always be greater than 32, to account for
> +	 * resistor calibration delays (hence the max(33, ...) below).
> +	 */
> +
> +	cfg.cn78xx.ref_zqcs_int = ((DDR3_TREFI / tclk_psecs / 512) & 0x7f);
> +	cfg.cn78xx.ref_zqcs_int |=
> +		((max(33ull, (DDR3_ZQCS_INTERNAL / (tclk_psecs / 100) /
> +			      (512 * 128))) & 0xfff) << 7);
> +
> +	cfg.cn78xx.early_dqx = 1;	/* Default to enabled */
> +
> +	s = lookup_env(priv, "ddr_early_dqx");
> +	if (!s)
> +		s = lookup_env(priv, "ddr%d_early_dqx", if_num);
> +
> +	if (s)
> +		cfg.cn78xx.early_dqx = simple_strtoul(s, NULL, 0);
> +
> +	cfg.cn78xx.sref_with_dll = 0;
> +
> +	cfg.cn78xx.rank_ena = bunk_enable;
> +	cfg.cn78xx.rankmask = rank_mask;	/* Set later */
> +	/* Mirror bits 1 and 3 only, and only for ranks that are present */
> +	cfg.cn78xx.mirrmask = (spd_addr_mirror << 1 | spd_addr_mirror << 3) &
> +		rank_mask;
> +	/* Set once and don't change it. */
> +	cfg.cn78xx.init_status = rank_mask;
> +	cfg.cn78xx.early_unload_d0_r0 = 0;
> +	cfg.cn78xx.early_unload_d0_r1 = 0;
> +	cfg.cn78xx.early_unload_d1_r0 = 0;
> +	cfg.cn78xx.early_unload_d1_r1 = 0;
> +	cfg.cn78xx.scrz = 0;
> +	if (octeon_is_cpuid(OCTEON_CN70XX))
> +		cfg.cn78xx.mode32b = 1;	/* Read-only. Always 1. */
> +	cfg.cn78xx.mode_x4dev = (dram_width == 4) ? 1 : 0;
> +	/* Cleared only for DDR4 with x16 devices */
> +	cfg.cn78xx.bg2_enable = ((ddr_type == DDR4_DRAM) &&
> +				 (dram_width == 16)) ? 0 : 1;
> +
> +	/* A full register override wins over everything computed above */
> +	s = lookup_env_ull(priv, "ddr_config");
> +	if (s)
> +		cfg.u64 = simple_strtoull(s, NULL, 0);
> +	debug("LMC_CONFIG                                    : 0x%016llx\n",
> +	      cfg.u64);
> +	lmc_wr(priv, CVMX_LMCX_CONFIG(if_num), cfg.u64);
> +}
> +
> +/*
> + * Read-modify-write CVMX_LMCX_CONTROL(if_num).
> + *
> + * With safe_ddr_flag set, fixed conservative values are used (ddr2t = 1,
> + * fprch2 = 2, throttling and in-order access enabled); otherwise ddr2t and
> + * fprch2 come from c_cfg and the other fields are 0. Individual fields can
> + * then be overridden through the "ddr_xor_bank", "ddr_2t", "ddr_fprch2",
> + * "ddr_bprch", "ddr_wodt_bprch", "ddr_rodt_bprch", "ddr_int_zqcs_dis" and
> + * "ddr_ext_zqcs_dis" environment variables; "ddr_control" replaces the
> + * complete 64-bit value.
> + *
> + * @priv: driver private data, handed to lookup_env*(), lmc_rd() and lmc_wr()
> + */
> +static void lmc_control(struct ddr_priv *priv)
> +{
> +	union cvmx_lmcx_control ctrl;
> +	char *s;
> +
> +	ctrl.u64 = lmc_rd(priv, CVMX_LMCX_CONTROL(if_num));
> +	ctrl.s.rdimm_ena = spd_rdimm;
> +	ctrl.s.bwcnt = 0;	/* Clear counter later */
> +	if (spd_rdimm)
> +		ctrl.s.ddr2t = (safe_ddr_flag ? 1 : c_cfg->ddr2t_rdimm);
> +	else
> +		ctrl.s.ddr2t = (safe_ddr_flag ? 1 : c_cfg->ddr2t_udimm);
> +	ctrl.s.pocas = 0;
> +	ctrl.s.fprch2 = (safe_ddr_flag ? 2 : c_cfg->fprch2);
> +	ctrl.s.throttle_rd = safe_ddr_flag ? 1 : 0;
> +	ctrl.s.throttle_wr = safe_ddr_flag ? 1 : 0;
> +	ctrl.s.inorder_rd = safe_ddr_flag ? 1 : 0;
> +	ctrl.s.inorder_wr = safe_ddr_flag ? 1 : 0;
> +	ctrl.s.elev_prio_dis = safe_ddr_flag ? 1 : 0;
> +	/* discards writes to addresses that don't exist in the DRAM */
> +	ctrl.s.nxm_write_en = 0;
> +	ctrl.s.max_write_batch = 8;
> +	ctrl.s.xor_bank = 1;
> +	ctrl.s.auto_dclkdis = 1;
> +	ctrl.s.int_zqcs_dis = 0;
> +	ctrl.s.ext_zqcs_dis = 0;
> +	ctrl.s.bprch = 1;
> +	ctrl.s.wodt_bprch = 1;
> +	ctrl.s.rodt_bprch = 1;
> +
> +	/* Per-field environment overrides */
> +	s = lookup_env(priv, "ddr_xor_bank");
> +	if (s)
> +		ctrl.s.xor_bank = simple_strtoul(s, NULL, 0);
> +
> +	s = lookup_env(priv, "ddr_2t");
> +	if (s)
> +		ctrl.s.ddr2t = simple_strtoul(s, NULL, 0);
> +
> +	s = lookup_env(priv, "ddr_fprch2");
> +	if (s)
> +		ctrl.s.fprch2 = simple_strtoul(s, NULL, 0);
> +
> +	s = lookup_env(priv, "ddr_bprch");
> +	if (s)
> +		ctrl.s.bprch = simple_strtoul(s, NULL, 0);
> +
> +	s = lookup_env(priv, "ddr_wodt_bprch");
> +	if (s)
> +		ctrl.s.wodt_bprch = simple_strtoul(s, NULL, 0);
> +
> +	s = lookup_env(priv, "ddr_rodt_bprch");
> +	if (s)
> +		ctrl.s.rodt_bprch = simple_strtoul(s, NULL, 0);
> +
> +	s = lookup_env(priv, "ddr_int_zqcs_dis");
> +	if (s)
> +		ctrl.s.int_zqcs_dis = simple_strtoul(s, NULL, 0);
> +
> +	s = lookup_env(priv, "ddr_ext_zqcs_dis");
> +	if (s)
> +		ctrl.s.ext_zqcs_dis = simple_strtoul(s, NULL, 0);
> +
> +	/* A full register override wins over everything set above */
> +	s = lookup_env_ull(priv, "ddr_control");
> +	if (s)
> +		ctrl.u64 = simple_strtoull(s, NULL, 0);
> +
> +	debug("LMC_CONTROL                                   : 0x%016llx\n",
> +	      ctrl.u64);
> +	lmc_wr(priv, CVMX_LMCX_CONTROL(if_num), ctrl.u64);
> +}
> +
> +/*
> + * Read-modify-write CVMX_LMCX_TIMING_PARAMS0(if_num).
> + *
> + * The fields are derived from the file-scope timing values (trp, trfc) and
> + * spec constants, divided by tclk_psecs with divide_roundup(). DDR4 and
> + * non-DDR4 use different formulas for tzqinit, tzqcs, tcke, tmrd, tmod and
> + * tdllk.
> + *
> + * Environment overrides: "ddr_use_old_trp" (non-zero selects the older,
> + * larger TRP value), "ddr_tzqinit" (DDR4 only) and "ddr_timing_params0"
> + * (replaces the complete 64-bit value).
> + *
> + * @priv: driver private data, handed to lookup_env*(), lmc_rd() and lmc_wr()
> + */
> +static void lmc_timing_params0(struct ddr_priv *priv)
> +{
> +	union cvmx_lmcx_timing_params0 tp0;
> +	unsigned int trp_value;
> +	char *s;
> +
> +	tp0.u64 = lmc_rd(priv, CVMX_LMCX_TIMING_PARAMS0(if_num));
> +
> +	trp_value = divide_roundup(trp, tclk_psecs) - 1;
> +	/* Log both candidates; "OLD" is what ddr_use_old_trp would select */
> +	debug("TIMING_PARAMS0[TRP]: NEW 0x%x, OLD 0x%x\n", trp_value,
> +	      trp_value +
> +	      (unsigned int)(divide_roundup(max(4ull * tclk_psecs, 7500ull),
> +					    tclk_psecs)) - 4);
> +	s = lookup_env_ull(priv, "ddr_use_old_trp");
> +	if (s) {
> +		if (!!simple_strtoull(s, NULL, 0)) {
> +			trp_value +=
> +			    divide_roundup(max(4ull * tclk_psecs, 7500ull),
> +					   tclk_psecs) - 4;
> +			debug("TIMING_PARAMS0[trp]: USING OLD 0x%x\n",
> +			      trp_value);
> +		}
> +	}
> +
> +	tp0.cn78xx.txpr =
> +	    divide_roundup(max(5ull * tclk_psecs, trfc + 10000ull),
> +			   16 * tclk_psecs);
> +	tp0.cn78xx.trp = trp_value & 0x1f;
> +	tp0.cn78xx.tcksre =
> +	    divide_roundup(max(5ull * tclk_psecs, 10000ull), tclk_psecs) - 1;
> +
> +	if (ddr_type == DDR4_DRAM) {
> +		int tzqinit = 4;	/* Default to 4, for all DDR4 speed bins */
> +
> +		s = lookup_env(priv, "ddr_tzqinit");
> +		if (s)
> +			tzqinit = simple_strtoul(s, NULL, 0);
> +
> +		tp0.cn78xx.tzqinit = tzqinit;
> +		/* Always 8. */
> +		tp0.cn78xx.tzqcs = divide_roundup(128 * tclk_psecs,
> +						  (16 * tclk_psecs));
> +		/* The DDR3 tCKE constant (5 ns) is used for DDR4 as well */
> +		tp0.cn78xx.tcke =
> +		    divide_roundup(max(3 * tclk_psecs, (ulong)DDR3_TCKE),
> +				   tclk_psecs) - 1;
> +		tp0.cn78xx.tmrd =
> +		    divide_roundup((DDR4_TMRD * tclk_psecs), tclk_psecs) - 1;
> +		tp0.cn78xx.tmod = 25;	/* 25 is the max allowed */
> +		tp0.cn78xx.tdllk = divide_roundup(DDR4_TDLLK, 256);
> +	} else {
> +		tp0.cn78xx.tzqinit =
> +		    divide_roundup(max(512ull * tclk_psecs, 640000ull),
> +				   (256 * tclk_psecs));
> +		tp0.cn78xx.tzqcs =
> +		    divide_roundup(max(64ull * tclk_psecs, DDR3_ZQCS),
> +				   (16 * tclk_psecs));
> +		tp0.cn78xx.tcke = divide_roundup(DDR3_TCKE, tclk_psecs) - 1;
> +		tp0.cn78xx.tmrd =
> +		    divide_roundup((DDR3_TMRD * tclk_psecs), tclk_psecs) - 1;
> +		tp0.cn78xx.tmod =
> +		    divide_roundup(max(12ull * tclk_psecs, 15000ull),
> +				   tclk_psecs) - 1;
> +		tp0.cn78xx.tdllk = divide_roundup(DDR3_TDLLK, 256);
> +	}
> +
> +	/* A full register override wins over everything computed above */
> +	s = lookup_env_ull(priv, "ddr_timing_params0");
> +	if (s)
> +		tp0.u64 = simple_strtoull(s, NULL, 0);
> +	debug("TIMING_PARAMS0                                : 0x%016llx\n",
> +	      tp0.u64);
> +	lmc_wr(priv, CVMX_LMCX_TIMING_PARAMS0(if_num), tp0.u64);
> +}
> +
> +/*
> + * Read-modify-write CVMX_LMCX_TIMING_PARAMS1(if_num).
> + *
> + * Programs tmprr, tras, trcd, twtr, trfc, trrd, txp, twlmrd, twldqsen,
> + * tfaw and txpdll from the file-scope timing values divided by tclk_psecs.
> + * trcd and txp each have an extension bit; on CN78XX pass 1 an overflowing
> + * trcd is written as 0 and an overflowing txp is capped at 7. For DDR4 3DS
> + * DIMMs trfc_dlr is additionally set from die_capacity; an unknown
> + * capacity is only reported and leaves the field as read.
> + *
> + * The "ddr_timing_params1" environment variable replaces the complete
> + * 64-bit value.
> + *
> + * @priv: driver private data, handed to lookup_env*(), lmc_rd() and lmc_wr()
> + */
> +static void lmc_timing_params1(struct ddr_priv *priv)
> +{
> +	union cvmx_lmcx_timing_params1 tp1;
> +	unsigned int txp, temp_trcd, trfc_dlr;
> +	char *s;
> +
> +	tp1.u64 = lmc_rd(priv, CVMX_LMCX_TIMING_PARAMS1(if_num));
> +
> +	/* .cn70xx. */
> +	tp1.s.tmprr = divide_roundup(DDR3_TMPRR * tclk_psecs, tclk_psecs) - 1;
> +
> +	tp1.cn78xx.tras = divide_roundup(tras, tclk_psecs) - 1;
> +
> +	temp_trcd = divide_roundup(trcd, tclk_psecs);
> +	if (temp_trcd > 15) {
> +		debug("TIMING_PARAMS1[trcd]: need extension bit for 0x%x\n",
> +		      temp_trcd);
> +	}
> +	if (octeon_is_cpuid(OCTEON_CN78XX_PASS1_X) && temp_trcd > 15) {
> +		/*
> +		 * Let .trcd=0 serve as a flag that the field has
> +		 * overflowed. Must use Additive Latency mode as a
> +		 * workaround.
> +		 */
> +		temp_trcd = 0;
> +	}
> +	/* Low four bits go to trcd, bit 4 to the extension field */
> +	tp1.cn78xx.trcd = (temp_trcd >> 0) & 0xf;
> +	tp1.cn78xx.trcd_ext = (temp_trcd >> 4) & 0x1;
> +
> +	tp1.cn78xx.twtr = divide_roundup(twtr, tclk_psecs) - 1;
> +	tp1.cn78xx.trfc = divide_roundup(trfc, 8 * tclk_psecs);
> +
> +	if (ddr_type == DDR4_DRAM) {
> +		/* Workaround bug 24006. Use Trrd_l. */
> +		tp1.cn78xx.trrd =
> +		    divide_roundup(ddr4_trrd_lmin, tclk_psecs) - 2;
> +	} else {
> +		tp1.cn78xx.trrd = divide_roundup(trrd, tclk_psecs) - 2;
> +	}
> +
> +	/*
> +	 * tXP = max( 3nCK, 7.5 ns)     DDR3-800   tCLK = 2500 psec
> +	 * tXP = max( 3nCK, 7.5 ns)     DDR3-1066  tCLK = 1875 psec
> +	 * tXP = max( 3nCK, 6.0 ns)     DDR3-1333  tCLK = 1500 psec
> +	 * tXP = max( 3nCK, 6.0 ns)     DDR3-1600  tCLK = 1250 psec
> +	 * tXP = max( 3nCK, 6.0 ns)     DDR3-1866  tCLK = 1071 psec
> +	 * tXP = max( 3nCK, 6.0 ns)     DDR3-2133  tCLK =  937 psec
> +	 */
> +	txp = (tclk_psecs < 1875) ? 6000 : 7500;
> +	txp = divide_roundup(max((unsigned int)(3 * tclk_psecs), txp),
> +			     tclk_psecs) - 1;
> +	if (txp > 7) {
> +		debug("TIMING_PARAMS1[txp]: need extension bit for 0x%x\n",
> +		      txp);
> +	}
> +	if (octeon_is_cpuid(OCTEON_CN78XX_PASS1_X) && txp > 7)
> +		txp = 7;	/* max it out */
> +	/* Low three bits go to txp, bit 3 to the extension field */
> +	tp1.cn78xx.txp = (txp >> 0) & 7;
> +	tp1.cn78xx.txp_ext = (txp >> 3) & 1;
> +
> +	tp1.cn78xx.twlmrd = divide_roundup(DDR3_TWLMRD * tclk_psecs,
> +					   4 * tclk_psecs);
> +	tp1.cn78xx.twldqsen = divide_roundup(DDR3_TWLDQSEN * tclk_psecs,
> +					     4 * tclk_psecs);
> +	tp1.cn78xx.tfaw = divide_roundup(tfaw, 4 * tclk_psecs);
> +	tp1.cn78xx.txpdll = divide_roundup(max(10ull * tclk_psecs, 24000ull),
> +					   tclk_psecs) - 1;
> +
> +	if (ddr_type == DDR4_DRAM && is_3ds_dimm) {
> +		/*
> +		 * 4 Gb: tRFC_DLR = 90 ns
> +		 * 8 Gb: tRFC_DLR = 120 ns
> +		 * 16 Gb: tRFC_DLR = 190 ns FIXME?
> +		 */
> +		if (die_capacity == 0x1000)	/* 4 Gbit */
> +			trfc_dlr = 90;
> +		else if (die_capacity == 0x2000)	/* 8 Gbit */
> +			trfc_dlr = 120;
> +		else if (die_capacity == 0x4000)	/* 16 Gbit */
> +			trfc_dlr = 190;
> +		else
> +			trfc_dlr = 0;
> +
> +		if (trfc_dlr == 0) {
> +			debug("N%d.LMC%d: ERROR: tRFC_DLR: die_capacity %u Mbit is illegal\n",
> +			      node, if_num, die_capacity);
> +		} else {
> +			/* trfc_dlr is in ns; * 1000 converts it to psec */
> +			tp1.cn78xx.trfc_dlr =
> +			    divide_roundup(trfc_dlr * 1000UL, 8 * tclk_psecs);
> +			debug("N%d.LMC%d: TIMING_PARAMS1[trfc_dlr] set to %u\n",
> +			      node, if_num, tp1.cn78xx.trfc_dlr);
> +		}
> +	}
> +
> +	/* A full register override wins over everything computed above */
> +	s = lookup_env_ull(priv, "ddr_timing_params1");
> +	if (s)
> +		tp1.u64 = simple_strtoull(s, NULL, 0);
> +
> +	debug("TIMING_PARAMS1                                : 0x%016llx\n",
> +	      tp1.u64);
> +	lmc_wr(priv, CVMX_LMCX_TIMING_PARAMS1(if_num), tp1.u64);
> +}
> +
> +/*
> + * DDR4 only: read-modify-write CVMX_LMCX_TIMING_PARAMS2(if_num); does
> + * nothing for other DRAM types.
> + *
> + * Sets trrd_l (plus its extension bit, capped at 7 on CN78XX pass 1),
> + * twtr_l, t_rw_op_max and trtp. The register is logged before and after
> + * the modification using the same "TIMING_PARAMS2" label. Afterwards the
> + * Errata 25823 workaround may raise TIMING_PARAMS1[twtr] and write that
> + * register back as well.
> + *
> + * @priv: driver private data, handed to lmc_rd() and lmc_wr()
> + */
> +static void lmc_timing_params2(struct ddr_priv *priv)
> +{
> +	if (ddr_type == DDR4_DRAM) {
> +		union cvmx_lmcx_timing_params1 tp1;
> +		union cvmx_lmcx_timing_params2 tp2;
> +		int temp_trrd_l;
> +
> +		tp1.u64 = lmc_rd(priv, CVMX_LMCX_TIMING_PARAMS1(if_num));
> +		tp2.u64 = lmc_rd(priv, CVMX_LMCX_TIMING_PARAMS2(if_num));
> +		debug("TIMING_PARAMS2                                : 0x%016llx\n",
> +		      tp2.u64);
> +
> +		temp_trrd_l = divide_roundup(ddr4_trrd_lmin, tclk_psecs) - 2;
> +		if (temp_trrd_l > 7)
> +			debug("TIMING_PARAMS2[trrd_l]: need extension bit for 0x%x\n",
> +			      temp_trrd_l);
> +		if (octeon_is_cpuid(OCTEON_CN78XX_PASS1_X) && temp_trrd_l > 7)
> +			temp_trrd_l = 7;	/* max it out */
> +		tp2.cn78xx.trrd_l = (temp_trrd_l >> 0) & 7;
> +		tp2.cn78xx.trrd_l_ext = (temp_trrd_l >> 3) & 1;
> +
> +		/*
> +		 * correct for 1600-2400
> +		 * NOTE(review): twtr_l uses divide_nint() while trtp below
> +		 * uses divide_roundup() on the same operands - confirm the
> +		 * different rounding is intended.
> +		 */
> +		tp2.s.twtr_l = divide_nint(max(4ull * tclk_psecs, 7500ull),
> +					   tclk_psecs) - 1;
> +		tp2.s.t_rw_op_max = 7;
> +		tp2.s.trtp = divide_roundup(max(4ull * tclk_psecs, 7500ull),
> +					    tclk_psecs) - 1;
> +
> +		debug("TIMING_PARAMS2                                : 0x%016llx\n",
> +		      tp2.u64);
> +		lmc_wr(priv, CVMX_LMCX_TIMING_PARAMS2(if_num), tp2.u64);
> +
> +		/*
> +		 * Workaround Errata 25823 - LMC: Possible DDR4 tWTR_L not met
> +		 * for Write-to-Read operations to the same Bank Group
> +		 */
> +		if (tp1.cn78xx.twtr < (tp2.s.twtr_l - 4)) {
> +			tp1.cn78xx.twtr = tp2.s.twtr_l - 4;
> +			debug("ERRATA 25823: NEW: TWTR: %d, TWTR_L: %d\n",
> +			      tp1.cn78xx.twtr, tp2.s.twtr_l);
> +			debug("TIMING_PARAMS1                                : 0x%016llx\n",
> +			      tp1.u64);
> +			lmc_wr(priv, CVMX_LMCX_TIMING_PARAMS1(if_num), tp1.u64);
> +		}
> +	}
> +}
> +
> +/*
> + * Read-modify-write CVMX_LMCX_MODEREG_PARAMS0(if_num).
> + *
> + * Encodes CAS write latency (cwl, chosen from tclk_psecs), CAS latency
> + * (cl, from the file-scope variable of the same name) and write recovery
> + * (wrp, from twr) using separate DDR4 and DDR3 encodings, and sets the
> + * remaining fields to fixed values.
> + *
> + * Environment overrides: "ddr_cwl", "ddr_cl", "ddr_wrp" and
> + * "ddr_modereg_params0" (replaces the complete 64-bit value). Note that
> + * "ddr_cl" also overwrites the file-scope variable cl.
> + *
> + * @priv: driver private data, handed to lookup_env*(), lmc_rd() and lmc_wr()
> + */
> +static void lmc_modereg_params0(struct ddr_priv *priv)
> +{
> +	union cvmx_lmcx_modereg_params0 mp0;
> +	int param;
> +	char *s;
> +
> +	mp0.u64 = lmc_rd(priv, CVMX_LMCX_MODEREG_PARAMS0(if_num));
> +
> +	/* Each later test overrides the earlier one: fastest match wins */
> +	if (ddr_type == DDR4_DRAM) {
> +		mp0.s.cwl = 0;	/* 1600 (1250ps) */
> +		if (tclk_psecs < 1250)
> +			mp0.s.cwl = 1;	/* 1866 (1072ps) */
> +		if (tclk_psecs < 1072)
> +			mp0.s.cwl = 2;	/* 2133 (938ps) */
> +		if (tclk_psecs < 938)
> +			mp0.s.cwl = 3;	/* 2400 (833ps) */
> +		if (tclk_psecs < 833)
> +			mp0.s.cwl = 4;	/* 2666 (750ps) */
> +		if (tclk_psecs < 750)
> +			mp0.s.cwl = 5;	/* 3200 (625ps) */
> +	} else {
> +		/*
> +		 ** CSR   CWL         CAS write Latency
> +		 ** ===   ===   =================================
> +		 **  0      5   (           tCK(avg) >=   2.5 ns)
> +		 **  1      6   (2.5 ns   > tCK(avg) >= 1.875 ns)
> +		 **  2      7   (1.875 ns > tCK(avg) >= 1.5   ns)
> +		 **  3      8   (1.5 ns   > tCK(avg) >= 1.25  ns)
> +		 **  4      9   (1.25 ns  > tCK(avg) >= 1.07  ns)
> +		 **  5     10   (1.07 ns  > tCK(avg) >= 0.935 ns)
> +		 **  6     11   (0.935 ns > tCK(avg) >= 0.833 ns)
> +		 **  7     12   (0.833 ns > tCK(avg) >= 0.75  ns)
> +		 */
> +
> +		mp0.s.cwl = 0;
> +		if (tclk_psecs < 2500)
> +			mp0.s.cwl = 1;
> +		if (tclk_psecs < 1875)
> +			mp0.s.cwl = 2;
> +		if (tclk_psecs < 1500)
> +			mp0.s.cwl = 3;
> +		if (tclk_psecs < 1250)
> +			mp0.s.cwl = 4;
> +		if (tclk_psecs < 1070)
> +			mp0.s.cwl = 5;
> +		if (tclk_psecs < 935)
> +			mp0.s.cwl = 6;
> +		if (tclk_psecs < 833)
> +			mp0.s.cwl = 7;
> +	}
> +
> +	/*
> +	 * NOTE(review): the "- 5" matches the DDR3 encoding above (CSR 0 is
> +	 * CWL 5), but the DDR4 debug output below decodes the field with a
> +	 * base of 9 - confirm how this override is meant to work for DDR4.
> +	 */
> +	s = lookup_env(priv, "ddr_cwl");
> +	if (s)
> +		mp0.s.cwl = simple_strtoul(s, NULL, 0) - 5;
> +
> +	if (ddr_type == DDR4_DRAM) {
> +		debug("%-45s : %d, [0x%x]\n", "CAS Write Latency CWL, [CSR]",
> +		      mp0.s.cwl + 9
> +		      + ((mp0.s.cwl > 2) ? (mp0.s.cwl - 3) * 2 : 0), mp0.s.cwl);
> +	} else {
> +		debug("%-45s : %d, [0x%x]\n", "CAS Write Latency CWL, [CSR]",
> +		      mp0.s.cwl + 5, mp0.s.cwl);
> +	}
> +
> +	mp0.s.mprloc = 0;
> +	mp0.s.mpr = 0;
> +	mp0.s.dll = (ddr_type == DDR4_DRAM);	/* 0 for DDR3 and 1 for DDR4 */
> +	mp0.s.al = 0;
> +	mp0.s.wlev = 0;		/* Read Only */
> +	if (octeon_is_cpuid(OCTEON_CN70XX) || ddr_type == DDR4_DRAM)
> +		mp0.s.tdqs = 0;
> +	else
> +		mp0.s.tdqs = 1;
> +	mp0.s.qoff = 0;
> +
> +	/* The override is stored in the file-scope cl, not only locally */
> +	s = lookup_env(priv, "ddr_cl");
> +	if (s) {
> +		cl = simple_strtoul(s, NULL, 0);
> +		debug("CAS Latency                                   : %6d\n",
> +		      cl);
> +	}
> +
> +	/* Map cl to the register encoding; the last matching test wins */
> +	if (ddr_type == DDR4_DRAM) {
> +		mp0.s.cl = 0x0;
> +		if (cl > 9)
> +			mp0.s.cl = 0x1;
> +		if (cl > 10)
> +			mp0.s.cl = 0x2;
> +		if (cl > 11)
> +			mp0.s.cl = 0x3;
> +		if (cl > 12)
> +			mp0.s.cl = 0x4;
> +		if (cl > 13)
> +			mp0.s.cl = 0x5;
> +		if (cl > 14)
> +			mp0.s.cl = 0x6;
> +		if (cl > 15)
> +			mp0.s.cl = 0x7;
> +		if (cl > 16)
> +			mp0.s.cl = 0x8;
> +		if (cl > 18)
> +			mp0.s.cl = 0x9;
> +		if (cl > 20)
> +			mp0.s.cl = 0xA;
> +		if (cl > 24)
> +			mp0.s.cl = 0xB;
> +	} else {
> +		mp0.s.cl = 0x2;
> +		if (cl > 5)
> +			mp0.s.cl = 0x4;
> +		if (cl > 6)
> +			mp0.s.cl = 0x6;
> +		if (cl > 7)
> +			mp0.s.cl = 0x8;
> +		if (cl > 8)
> +			mp0.s.cl = 0xA;
> +		if (cl > 9)
> +			mp0.s.cl = 0xC;
> +		if (cl > 10)
> +			mp0.s.cl = 0xE;
> +		if (cl > 11)
> +			mp0.s.cl = 0x1;
> +		if (cl > 12)
> +			mp0.s.cl = 0x3;
> +		if (cl > 13)
> +			mp0.s.cl = 0x5;
> +		if (cl > 14)
> +			mp0.s.cl = 0x7;
> +		if (cl > 15)
> +			mp0.s.cl = 0x9;
> +	}
> +
> +	mp0.s.rbt = 0;		/* Read Only. */
> +	mp0.s.tm = 0;
> +	mp0.s.dllr = 0;
> +
> +	/* twr divided by the clock period selects the wrp encoding */
> +	param = divide_roundup(twr, tclk_psecs);
> +
> +	if (ddr_type == DDR4_DRAM) {	/* DDR4 */
> +		mp0.s.wrp = 1;
> +		if (param > 12)
> +			mp0.s.wrp = 2;
> +		if (param > 14)
> +			mp0.s.wrp = 3;
> +		if (param > 16)
> +			mp0.s.wrp = 4;
> +		if (param > 18)
> +			mp0.s.wrp = 5;
> +		if (param > 20)
> +			mp0.s.wrp = 6;
> +		if (param > 24)	/* RESERVED in DDR4 spec */
> +			mp0.s.wrp = 7;
> +	} else {		/* DDR3 */
> +		mp0.s.wrp = 1;
> +		if (param > 5)
> +			mp0.s.wrp = 2;
> +		if (param > 6)
> +			mp0.s.wrp = 3;
> +		if (param > 7)
> +			mp0.s.wrp = 4;
> +		if (param > 8)
> +			mp0.s.wrp = 5;
> +		if (param > 10)
> +			mp0.s.wrp = 6;
> +		if (param > 12)
> +			mp0.s.wrp = 7;
> +	}
> +
> +	mp0.s.ppd = 0;
> +
> +	s = lookup_env(priv, "ddr_wrp");
> +	if (s)
> +		mp0.s.wrp = simple_strtoul(s, NULL, 0);
> +
> +	debug("%-45s : %d, [0x%x]\n",
> +	      "Write recovery for auto precharge WRP, [CSR]", param, mp0.s.wrp);
> +
> +	/* A full register override wins over everything computed above */
> +	s = lookup_env_ull(priv, "ddr_modereg_params0");
> +	if (s)
> +		mp0.u64 = simple_strtoull(s, NULL, 0);
> +
> +	debug("MODEREG_PARAMS0                               : 0x%016llx\n",
> +	      mp0.u64);
> +	lmc_wr(priv, CVMX_LMCX_MODEREG_PARAMS0(if_num), mp0.u64);
> +}
> +
> +/*
> + * Build MODEREG_PARAMS1 and write it to CVMX_LMCX_MODEREG_PARAMS1(if_num).
> + *
> + * The starting value is odt_config[odt_idx].modereg_params1. Per-rank
> + * RTT_NOM, RTT_WR and DIC fields can be overridden through environment
> + * variables ("ddr_rtt_nom_<d><r>", "ddr_rtt_nom", "ddr_rtt_wr_<d><r>",
> + * "ddr_dic", "ddr_dic_<d><r>", partly with "ddr%d_" per-interface
> + * variants); "ddr_modereg_params1" replaces the complete 64-bit value.
> + *
> + * Side effects on file-scope state: default_rtt_nom[] receives the RTT_NOM
> + * values, dyn_rtt_nom_mask may be set from "ddr_rtt_nom_mask", and
> + * ddr_rtt_nom_auto is taken from c_cfg and cleared as soon as any RTT_NOM
> + * override is present.
> + *
> + * @priv: driver private data, handed to lookup_env*() and lmc_wr()
> + */
> +static void lmc_modereg_params1(struct ddr_priv *priv)
> +{
> +	union cvmx_lmcx_modereg_params1 mp1;
> +	char *s;
> +	int i;
> +
> +	mp1.u64 = odt_config[odt_idx].modereg_params1.u64;
> +
> +#ifdef CAVIUM_ONLY
> +	/*
> +	 * Special request: mismatched DIMM support. Slot 0: 2-Rank,
> +	 * Slot 1: 1-Rank
> +	 */
> +	if (rank_mask == 0x7) {	/* 2-Rank, 1-Rank */
> +		mp1.s.rtt_nom_00 = 0;
> +		mp1.s.rtt_nom_01 = 3;	/* rttnom_40ohm */
> +		mp1.s.rtt_nom_10 = 3;	/* rttnom_40ohm */
> +		mp1.s.rtt_nom_11 = 0;
> +		dyn_rtt_nom_mask = 0x6;
> +	}
> +#endif /* CAVIUM_ONLY */
> +
> +	s = lookup_env(priv, "ddr_rtt_nom_mask");
> +	if (s)
> +		dyn_rtt_nom_mask = simple_strtoul(s, NULL, 0);
> +
> +	/*
> +	 * Save the original rtt_nom settings before sweeping through
> +	 * settings.
> +	 */
> +	default_rtt_nom[0] = mp1.s.rtt_nom_00;
> +	default_rtt_nom[1] = mp1.s.rtt_nom_01;
> +	default_rtt_nom[2] = mp1.s.rtt_nom_10;
> +	default_rtt_nom[3] = mp1.s.rtt_nom_11;
> +
> +	ddr_rtt_nom_auto = c_cfg->ddr_rtt_nom_auto;
> +
> +	/* Per-rank RTT_NOM override: 3-bit field at bit (i * 12 + 9) */
> +	for (i = 0; i < 4; ++i) {
> +		u64 value;
> +
> +		s = lookup_env(priv, "ddr_rtt_nom_%1d%1d", !!(i & 2),
> +			       !!(i & 1));
> +		if (!s)
> +			s = lookup_env(priv, "ddr%d_rtt_nom_%1d%1d", if_num,
> +				       !!(i & 2), !!(i & 1));
> +		if (s) {
> +			value = simple_strtoul(s, NULL, 0);
> +			mp1.u64 &= ~((u64)0x7 << (i * 12 + 9));
> +			mp1.u64 |= ((value & 0x7) << (i * 12 + 9));
> +			/*
> +			 * NOTE(review): stored unmasked, while the register
> +			 * field above gets (value & 0x7) - confirm.
> +			 */
> +			default_rtt_nom[i] = value;
> +			ddr_rtt_nom_auto = 0;
> +		}
> +	}
> +
> +	/* Common RTT_NOM override, applied to ranks in dyn_rtt_nom_mask */
> +	s = lookup_env(priv, "ddr_rtt_nom");
> +	if (!s)
> +		s = lookup_env(priv, "ddr%d_rtt_nom", if_num);
> +	if (s) {
> +		u64 value;
> +
> +		value = simple_strtoul(s, NULL, 0);
> +
> +		if (dyn_rtt_nom_mask & 1) {
> +			default_rtt_nom[0] = value;
> +			mp1.s.rtt_nom_00 = value;
> +		}
> +		if (dyn_rtt_nom_mask & 2) {
> +			default_rtt_nom[1] = value;
> +			mp1.s.rtt_nom_01 = value;
> +		}
> +		if (dyn_rtt_nom_mask & 4) {
> +			default_rtt_nom[2] = value;
> +			mp1.s.rtt_nom_10 = value;
> +		}
> +		if (dyn_rtt_nom_mask & 8) {
> +			default_rtt_nom[3] = value;
> +			mp1.s.rtt_nom_11 = value;
> +		}
> +
> +		ddr_rtt_nom_auto = 0;
> +	}
> +
> +	/* Per-rank RTT_WR override, inserted through insrt_wr() */
> +	for (i = 0; i < 4; ++i) {
> +		u64 value;
> +
> +		s = lookup_env(priv, "ddr_rtt_wr_%1d%1d", !!(i & 2), !!(i & 1));
> +		if (!s)
> +			s = lookup_env(priv, "ddr%d_rtt_wr_%1d%1d", if_num,
> +				       !!(i & 2), !!(i & 1));
> +		if (s) {
> +			value = simple_strtoul(s, NULL, 0);
> +			insrt_wr(&mp1.u64, i, value);
> +		}
> +	}
> +
> +	/*
> +	 * Make sure 78XX pass 1 has valid RTT_WR settings, because
> +	 * configuration files may be set-up for later chips, and
> +	 * 78XX pass 1 supports no RTT_WR extension bits
> +	 */
> +	if (octeon_is_cpuid(OCTEON_CN78XX_PASS1_X)) {
> +		for (i = 0; i < 4; ++i) {
> +			/* if 80 or undefined */
> +			if (extr_wr(mp1.u64, i) > 3) {
> +				/* FIXME? always insert 120 */
> +				insrt_wr(&mp1.u64, i, 1);
> +				debug("RTT_WR_%d%d set to 120 for CN78XX pass 1\n",
> +				      !!(i & 2), i & 1);
> +			}
> +		}
> +	}
> +
> +	/* Common DIC override for all ranks: 2-bit field at bit (i * 12 + 7) */
> +	s = lookup_env(priv, "ddr_dic");
> +	if (s) {
> +		u64 value = simple_strtoul(s, NULL, 0);
> +
> +		for (i = 0; i < 4; ++i) {
> +			mp1.u64 &= ~((u64)0x3 << (i * 12 + 7));
> +			mp1.u64 |= ((value & 0x3) << (i * 12 + 7));
> +		}
> +	}
> +
> +	/* Per-rank DIC override; takes precedence over "ddr_dic" */
> +	for (i = 0; i < 4; ++i) {
> +		u64 value;
> +
> +		s = lookup_env(priv, "ddr_dic_%1d%1d", !!(i & 2), !!(i & 1));
> +		if (s) {
> +			value = simple_strtoul(s, NULL, 0);
> +			mp1.u64 &= ~((u64)0x3 << (i * 12 + 7));
> +			mp1.u64 |= ((value & 0x3) << (i * 12 + 7));
> +		}
> +	}
> +
> +	/* A full register override wins over everything set above */
> +	s = lookup_env_ull(priv, "ddr_modereg_params1");
> +	if (s)
> +		mp1.u64 = simple_strtoull(s, NULL, 0);
> +
> +	debug("RTT_NOM     %3d, %3d, %3d, %3d ohms           :  %x,%x,%x,%x\n",
> +	      imp_val->rtt_nom_ohms[mp1.s.rtt_nom_11],
> +	      imp_val->rtt_nom_ohms[mp1.s.rtt_nom_10],
> +	      imp_val->rtt_nom_ohms[mp1.s.rtt_nom_01],
> +	      imp_val->rtt_nom_ohms[mp1.s.rtt_nom_00],
> +	      mp1.s.rtt_nom_11,
> +	      mp1.s.rtt_nom_10, mp1.s.rtt_nom_01, mp1.s.rtt_nom_00);
> +
> +	debug("RTT_WR      %3d, %3d, %3d, %3d ohms           :  %x,%x,%x,%x\n",
> +	      imp_val->rtt_wr_ohms[extr_wr(mp1.u64, 3)],
> +	      imp_val->rtt_wr_ohms[extr_wr(mp1.u64, 2)],
> +	      imp_val->rtt_wr_ohms[extr_wr(mp1.u64, 1)],
> +	      imp_val->rtt_wr_ohms[extr_wr(mp1.u64, 0)],
> +	      extr_wr(mp1.u64, 3),
> +	      extr_wr(mp1.u64, 2), extr_wr(mp1.u64, 1), extr_wr(mp1.u64, 0));
> +
> +	debug("DIC         %3d, %3d, %3d, %3d ohms           :  %x,%x,%x,%x\n",
> +	      imp_val->dic_ohms[mp1.s.dic_11],
> +	      imp_val->dic_ohms[mp1.s.dic_10],
> +	      imp_val->dic_ohms[mp1.s.dic_01],
> +	      imp_val->dic_ohms[mp1.s.dic_00],
> +	      mp1.s.dic_11, mp1.s.dic_10, mp1.s.dic_01, mp1.s.dic_00);
> +
> +	debug("MODEREG_PARAMS1                               : 0x%016llx\n",
> +	      mp1.u64);
> +	lmc_wr(priv, CVMX_LMCX_MODEREG_PARAMS1(if_num), mp1.u64);
> +}
> +
> +/*
> + * DDR4 only: build MODEREG_PARAMS2 and write it to
> + * CVMX_LMCX_MODEREG_PARAMS2(if_num); does nothing for other DRAM types.
> + *
> + * The starting value is odt_config[odt_idx].modereg_params2. "ddr_rtt_park"
> + * sets RTT_PARK for all four ranks, "ddr_rtt_park_<d><r>" for a single rank
> + * (applied afterwards, so it takes precedence), and "ddr_modereg_params2"
> + * replaces the complete 64-bit value.
> + *
> + * @priv: driver private data, handed to lookup_env*() and lmc_wr()
> + */
> +static void lmc_modereg_params2(struct ddr_priv *priv)
> +{
> +	char *s;
> +	int i;
> +
> +	if (ddr_type == DDR4_DRAM) {
> +		union cvmx_lmcx_modereg_params2 mp2;
> +
> +		mp2.u64 = odt_config[odt_idx].modereg_params2.u64;
> +
> +		/* RTT_PARK is a 3-bit field at bit (i * 10) for rank i */
> +		s = lookup_env(priv, "ddr_rtt_park");
> +		if (s) {
> +			u64 value = simple_strtoul(s, NULL, 0);
> +
> +			for (i = 0; i < 4; ++i) {
> +				mp2.u64 &= ~((u64)0x7 << (i * 10 + 0));
> +				mp2.u64 |= ((value & 0x7) << (i * 10 + 0));
> +			}
> +		}
> +
> +		for (i = 0; i < 4; ++i) {
> +			u64 value;
> +
> +			s = lookup_env(priv, "ddr_rtt_park_%1d%1d", !!(i & 2),
> +				       !!(i & 1));
> +			if (s) {
> +				value = simple_strtoul(s, NULL, 0);
> +				mp2.u64 &= ~((u64)0x7 << (i * 10 + 0));
> +				mp2.u64 |= ((value & 0x7) << (i * 10 + 0));
> +			}
> +		}
> +
> +		/* A full register override wins over everything set above */
> +		s = lookup_env_ull(priv, "ddr_modereg_params2");
> +		if (s)
> +			mp2.u64 = simple_strtoull(s, NULL, 0);
> +
> +		/* RTT_PARK is decoded with the rtt_nom_ohms lookup table */
> +		debug("RTT_PARK    %3d, %3d, %3d, %3d ohms           :  %x,%x,%x,%x\n",
> +		      imp_val->rtt_nom_ohms[mp2.s.rtt_park_11],
> +		      imp_val->rtt_nom_ohms[mp2.s.rtt_park_10],
> +		      imp_val->rtt_nom_ohms[mp2.s.rtt_park_01],
> +		      imp_val->rtt_nom_ohms[mp2.s.rtt_park_00],
> +		      mp2.s.rtt_park_11, mp2.s.rtt_park_10, mp2.s.rtt_park_01,
> +		      mp2.s.rtt_park_00);
> +
> +		debug("%-45s :  0x%x,0x%x,0x%x,0x%x\n", "VREF_RANGE",
> +		      mp2.s.vref_range_11,
> +		      mp2.s.vref_range_10,
> +		      mp2.s.vref_range_01, mp2.s.vref_range_00);
> +
> +		debug("%-45s :  0x%x,0x%x,0x%x,0x%x\n", "VREF_VALUE",
> +		      mp2.s.vref_value_11,
> +		      mp2.s.vref_value_10,
> +		      mp2.s.vref_value_01, mp2.s.vref_value_00);
> +
> +		debug("MODEREG_PARAMS2                               : 0x%016llx\n",
> +		      mp2.u64);
> +		lmc_wr(priv, CVMX_LMCX_MODEREG_PARAMS2(if_num), mp2.u64);
> +	}
> +}
> +
> +/*
> + * DDR4 only: read-modify-write CVMX_LMCX_MODEREG_PARAMS3(if_num); does
> + * nothing for other DRAM types.
> + *
> + * Clears rd_dbi, derives tccd_l from ddr4_tccd_lmin, optionally sets
> + * rd_preamble from the "ddr_rd_preamble" environment variable and, on
> + * everything but CN78XX pass 1, programs the xrank_add_tccd_l/_s fields.
> + * There is no whole-register environment override for this register.
> + *
> + * @priv: driver private data, handed to lookup_env(), lmc_rd() and lmc_wr()
> + */
> +static void lmc_modereg_params3(struct ddr_priv *priv)
> +{
> +	char *s;
> +
> +	if (ddr_type == DDR4_DRAM) {
> +		union cvmx_lmcx_modereg_params3 mp3;
> +
> +		mp3.u64 = lmc_rd(priv, CVMX_LMCX_MODEREG_PARAMS3(if_num));
> +		/* Disable as workaround to Errata 20547 */
> +		mp3.s.rd_dbi = 0;
> +		/* At least 5 before the - 4, so the field is at least 1 */
> +		mp3.s.tccd_l = max(divide_roundup(ddr4_tccd_lmin, tclk_psecs),
> +				   5ull) - 4;
> +
> +		s = lookup_env(priv, "ddr_rd_preamble");
> +		if (s)
> +			mp3.s.rd_preamble = !!simple_strtoul(s, NULL, 0);
> +
> +		if (!octeon_is_cpuid(OCTEON_CN78XX_PASS1_X)) {
> +			int delay = 0;
> +
> +			/* Extra delay only for 4 logical ranks at >= 1 GHz */
> +			if (lranks_per_prank == 4 && ddr_hertz >= 1000000000)
> +				delay = 1;
> +
> +			mp3.s.xrank_add_tccd_l = delay;
> +			mp3.s.xrank_add_tccd_s = delay;
> +		}
> +
> +		lmc_wr(priv, CVMX_LMCX_MODEREG_PARAMS3(if_num), mp3.u64);
> +		debug("MODEREG_PARAMS3                               : 0x%016llx\n",
> +		      mp3.u64);
> +	}
> +}
> +
> +/*
> + * Read-modify-write CVMX_LMCX_NXM(if_num).
> + *
> + * For every rank present in rank_mask the matching mem_msb_d<d>_r<r> field
> + * is set to (row_lsb + row_bits + lranks_bits - 26); fields of absent
> + * ranks keep the value that was read. cs_mask gets the inverted rank_mask,
> + * limited to 8 bits. The "ddr_nxm" environment variable replaces the
> + * complete 64-bit value.
> + *
> + * @priv: driver private data, handed to lookup_env_ull(), lmc_rd() and
> + *	  lmc_wr()
> + */
> +static void lmc_nxm(struct ddr_priv *priv)
> +{
> +	union cvmx_lmcx_nxm lmc_nxm;
> +	int num_bits = row_lsb + row_bits + lranks_bits - 26;
> +	char *s;
> +
> +	lmc_nxm.u64 = lmc_rd(priv, CVMX_LMCX_NXM(if_num));
> +
> +	/* .cn78xx. */
> +	if (rank_mask & 0x1)
> +		lmc_nxm.cn78xx.mem_msb_d0_r0 = num_bits;
> +	if (rank_mask & 0x2)
> +		lmc_nxm.cn78xx.mem_msb_d0_r1 = num_bits;
> +	if (rank_mask & 0x4)
> +		lmc_nxm.cn78xx.mem_msb_d1_r0 = num_bits;
> +	if (rank_mask & 0x8)
> +		lmc_nxm.cn78xx.mem_msb_d1_r1 = num_bits;
> +
> +	/* Set the mask for non-existent ranks. */
> +	lmc_nxm.cn78xx.cs_mask = ~rank_mask & 0xff;
> +
> +	/* A full register override wins over everything set above */
> +	s = lookup_env_ull(priv, "ddr_nxm");
> +	if (s)
> +		lmc_nxm.u64 = simple_strtoull(s, NULL, 0);
> +
> +	debug("LMC_NXM                                       : 0x%016llx\n",
> +	      lmc_nxm.u64);
> +	lmc_wr(priv, CVMX_LMCX_NXM(if_num), lmc_nxm.u64);
> +}
> +
> +/*
> + * Write odt_config[odt_idx].odt_mask, or the value of the "ddr_wodt_mask"
> + * environment variable if that is set, to CVMX_LMCX_WODT_MASK(if_num).
> + *
> + * @priv: driver private data, handed to lookup_env_ull() and lmc_wr()
> + */
> +static void lmc_wodt_mask(struct ddr_priv *priv)
> +{
> +	union cvmx_lmcx_wodt_mask wodt_mask;
> +	char *s;
> +
> +	wodt_mask.u64 = odt_config[odt_idx].odt_mask;
> +
> +	s = lookup_env_ull(priv, "ddr_wodt_mask");
> +	if (s)
> +		wodt_mask.u64 = simple_strtoull(s, NULL, 0);
> +
> +	debug("WODT_MASK                                     : 0x%016llx\n",
> +	      wodt_mask.u64);
> +	lmc_wr(priv, CVMX_LMCX_WODT_MASK(if_num), wodt_mask.u64);
> +}
> +
> +/*
> + * Write odt_config[odt_idx].rodt_ctl, or the value of the "ddr_rodt_mask"
> + * environment variable if that is set, to CVMX_LMCX_RODT_MASK(if_num).
> + *
> + * Afterwards the file-scope dyn_rtt_nom_mask is recomputed from scratch as
> + * the OR of the 8-bit mask slices of all ranks present in rank_mask. This
> + * discards any value set earlier, e.g. by lmc_modereg_params1().
> + *
> + * @priv: driver private data, handed to lookup_env_ull() and lmc_wr()
> + */
> +static void lmc_rodt_mask(struct ddr_priv *priv)
> +{
> +	union cvmx_lmcx_rodt_mask rodt_mask;
> +	int rankx;
> +	char *s;
> +
> +	rodt_mask.u64 = odt_config[odt_idx].rodt_ctl;
> +
> +	s = lookup_env_ull(priv, "ddr_rodt_mask");
> +	if (s)
> +		rodt_mask.u64 = simple_strtoull(s, NULL, 0);
> +
> +	debug("%-45s : 0x%016llx\n", "RODT_MASK", rodt_mask.u64);
> +	lmc_wr(priv, CVMX_LMCX_RODT_MASK(if_num), rodt_mask.u64);
> +
> +	dyn_rtt_nom_mask = 0;
> +	/* Byte rankx of the register value is the mask slice for that rank */
> +	for (rankx = 0; rankx < dimm_count * 4; rankx++) {
> +		if (!(rank_mask & (1 << rankx)))
> +			continue;
> +		dyn_rtt_nom_mask |= ((rodt_mask.u64 >> (8 * rankx)) & 0xff);
> +	}
> +	if (num_ranks == 4) {
> +		/*
> +		 * Normally ODT1 is wired to rank 1. For quad-ranked DIMMs
> +		 * ODT1 is wired to the third rank (rank 2).  The mask,
> +		 * dyn_rtt_nom_mask, is used to indicate for which ranks
> +		 * to sweep RTT_NOM during read-leveling. Shift the bit
> +		 * from the ODT1 position over to the "ODT2" position so
> +		 * that the read-leveling analysis comes out right.
> +		 */
> +		int odt1_bit = dyn_rtt_nom_mask & 2;
> +
> +		dyn_rtt_nom_mask &= ~2;
> +		dyn_rtt_nom_mask |= odt1_bit << 1;
> +	}
> +	debug("%-45s : 0x%02x\n", "DYN_RTT_NOM_MASK", dyn_rtt_nom_mask);
> +}
> +
> +/*
> + * Set up LMC(if_num)_COMP_CTL2: the DQX, CK, CMD and CONTROL drive
> + * strength selections and the read ODT setting (RODT_CTL).
> + *
> + * Defaults are taken from odt_config[odt_idx] and c_cfg, then adjusted
> + * for fast DDR4 configurations, and finally overridden by any matching
> + * environment variables. As a side effect the file-scope variables
> + * ddr_rodt_ctl_auto and default_rodt_ctl are updated.
> + *
> + * NOTE(review): fields are accessed through the .s, .cn70xx and .cn78xx
> + * views of the union interchangeably; this presumably relies on the
> + * views sharing one layout for these fields - confirm against the
> + * register definitions.
> + */
> +static void lmc_comp_ctl2(struct ddr_priv *priv)
> +{
> +	union cvmx_lmcx_comp_ctl2 cc2;
> +	char *s;
> +
> +	cc2.u64 = lmc_rd(priv, CVMX_LMCX_COMP_CTL2(if_num));
> +
> +	cc2.cn78xx.dqx_ctl = odt_config[odt_idx].odt_ena;
> +	/* Default 4=34.3 ohm */
> +	cc2.cn78xx.ck_ctl = (c_cfg->ck_ctl == 0) ? 4 : c_cfg->ck_ctl;
> +	/* Default 4=34.3 ohm */
> +	cc2.cn78xx.cmd_ctl = (c_cfg->cmd_ctl == 0) ? 4 : c_cfg->cmd_ctl;
> +	/* Default 4=34.3 ohm */
> +	cc2.cn78xx.control_ctl = (c_cfg->ctl_ctl == 0) ? 4 : c_cfg->ctl_ctl;
> +
> +	/* Board default for automatic RODT selection, overridable via env */
> +	ddr_rodt_ctl_auto = c_cfg->ddr_rodt_ctl_auto;
> +	s = lookup_env(priv, "ddr_rodt_ctl_auto");
> +	if (s)
> +		ddr_rodt_ctl_auto = !!simple_strtoul(s, NULL, 0);
> +
> +	/*
> +	 * An explicit RODT_CTL value (global or per-interface variable)
> +	 * also switches the automatic selection off.
> +	 */
> +	default_rodt_ctl = odt_config[odt_idx].qs_dic;
> +	s = lookup_env(priv, "ddr_rodt_ctl");
> +	if (!s)
> +		s = lookup_env(priv, "ddr%d_rodt_ctl", if_num);
> +	if (s) {
> +		default_rodt_ctl = simple_strtoul(s, NULL, 0);
> +		ddr_rodt_ctl_auto = 0;
> +	}
> +
> +	cc2.cn70xx.rodt_ctl = default_rodt_ctl;
> +
> +	/*
> +	 * if DDR4, force CK_CTL to 26 ohms if it is currently 34 ohms,
> +	 * and DCLK speed is 1 GHz or more...
> +	 */
> +	if (ddr_type == DDR4_DRAM && cc2.s.ck_ctl == ddr4_driver_34_ohm &&
> +	    ddr_hertz >= 1000000000) {
> +		/* lowest for DDR4 is 26 ohms */
> +		cc2.s.ck_ctl = ddr4_driver_26_ohm;
> +		debug("N%d.LMC%d: Forcing DDR4 COMP_CTL2[CK_CTL] to %d, %d ohms\n",
> +		      node, if_num, cc2.s.ck_ctl,
> +		      imp_val->drive_strength[cc2.s.ck_ctl]);
> +	}
> +
> +	/*
> +	 * if DDR4, 2DPC, UDIMM, force CONTROL_CTL and CMD_CTL to 26 ohms,
> +	 * if DCLK speed is 1 GHz or more...
> +	 */
> +	if (ddr_type == DDR4_DRAM && dimm_count == 2 &&
> +	    (spd_dimm_type == 2 || spd_dimm_type == 6) &&
> +	    ddr_hertz >= 1000000000) {
> +		/* lowest for DDR4 is 26 ohms */
> +		cc2.cn78xx.control_ctl = ddr4_driver_26_ohm;
> +		/* lowest for DDR4 is 26 ohms */
> +		cc2.cn78xx.cmd_ctl = ddr4_driver_26_ohm;
> +		debug("N%d.LMC%d: Forcing DDR4 COMP_CTL2[CONTROL_CTL,CMD_CTL] to %d, %d ohms\n",
> +		      node, if_num, ddr4_driver_26_ohm,
> +		      imp_val->drive_strength[ddr4_driver_26_ohm]);
> +	}
> +
> +	/* Environment overrides are applied last and win over the above */
> +	s = lookup_env(priv, "ddr_ck_ctl");
> +	if (s)
> +		cc2.cn78xx.ck_ctl = simple_strtoul(s, NULL, 0);
> +
> +	s = lookup_env(priv, "ddr_cmd_ctl");
> +	if (s)
> +		cc2.cn78xx.cmd_ctl = simple_strtoul(s, NULL, 0);
> +
> +	s = lookup_env(priv, "ddr_control_ctl");
> +	if (s)
> +		cc2.cn70xx.control_ctl = simple_strtoul(s, NULL, 0);
> +
> +	s = lookup_env(priv, "ddr_dqx_ctl");
> +	if (s)
> +		cc2.cn78xx.dqx_ctl = simple_strtoul(s, NULL, 0);
> +
> +	/* Report each selection together with its impedance in ohms */
> +	debug("%-45s : %d, %d ohms\n", "DQX_CTL           ", cc2.cn78xx.dqx_ctl,
> +	      imp_val->drive_strength[cc2.cn78xx.dqx_ctl]);
> +	debug("%-45s : %d, %d ohms\n", "CK_CTL            ", cc2.cn78xx.ck_ctl,
> +	      imp_val->drive_strength[cc2.cn78xx.ck_ctl]);
> +	debug("%-45s : %d, %d ohms\n", "CMD_CTL           ", cc2.cn78xx.cmd_ctl,
> +	      imp_val->drive_strength[cc2.cn78xx.cmd_ctl]);
> +	debug("%-45s : %d, %d ohms\n", "CONTROL_CTL       ",
> +	      cc2.cn78xx.control_ctl,
> +	      imp_val->drive_strength[cc2.cn78xx.control_ctl]);
> +	debug("Read ODT_CTL                                  : 0x%x (%d ohms)\n",
> +	      cc2.cn78xx.rodt_ctl, imp_val->rodt_ohms[cc2.cn78xx.rodt_ctl]);
> +
> +	debug("%-45s : 0x%016llx\n", "COMP_CTL2", cc2.u64);
> +	lmc_wr(priv, CVMX_LMCX_COMP_CTL2(if_num), cc2.u64);
> +}
> +
> +/*
> + * Update LMC(if_num)_PHY_CTL.
> + *
> + * TS_STAGGER and DSK_DBG_OVERWRT_ENA are cleared. On anything but
> + * CN78XX pass 1.x, when more than one logical rank per package rank is
> + * in use (3DS), C0_SEL and C1_SEL are both set to 2.
> + */
> +static void lmc_phy_ctl(struct ddr_priv *priv)
> +{
> +	union cvmx_lmcx_phy_ctl ctl;
> +
> +	ctl.u64 = lmc_rd(priv, CVMX_LMCX_PHY_CTL(if_num));
> +
> +	/* FIXME: are there others TBD? */
> +	ctl.s.dsk_dbg_overwrt_ena = 0;
> +	ctl.s.ts_stagger = 0;
> +
> +	if (!octeon_is_cpuid(OCTEON_CN78XX_PASS1_X)) {
> +		if (lranks_per_prank > 1) {
> +			/* C0 is TEN, C1 is A17 */
> +			ctl.s.c1_sel = 2;
> +			ctl.s.c0_sel = 2;
> +			debug("N%d.LMC%d: 3DS: setting PHY_CTL[cx_csel] = %d\n",
> +			      node, if_num, ctl.s.c1_sel);
> +		}
> +	}
> +
> +	debug("PHY_CTL                                       : 0x%016llx\n",
> +	      ctl.u64);
> +	lmc_wr(priv, CVMX_LMCX_PHY_CTL(if_num), ctl.u64);
> +}
> +
> +/*
> + * Set up LMC(if_num)_EXT_CONFIG.
> + *
> + * VREFINT_SEQ_DESKEW and INVERT_DATA are cleared and the four
> + * read/drive front/back porch enables default to 1. Each porch enable
> + * can be overridden through its environment variable. On anything but
> + * CN78XX pass 1.x with more than one logical rank per package rank
> + * (3DS), both DIMMx_CID fields are set to lranks_bits.
> + *
> + * The overrides are parsed with simple_strtoul(), the same helper every
> + * other environment override in this file uses.
> + */
> +static void lmc_ext_config(struct ddr_priv *priv)
> +{
> +	union cvmx_lmcx_ext_config ext_cfg;
> +	char *s;
> +
> +	ext_cfg.u64 = lmc_rd(priv, CVMX_LMCX_EXT_CONFIG(if_num));
> +	ext_cfg.s.vrefint_seq_deskew = 0;
> +	ext_cfg.s.read_ena_bprch = 1;
> +	ext_cfg.s.read_ena_fprch = 1;
> +	ext_cfg.s.drive_ena_fprch = 1;
> +	ext_cfg.s.drive_ena_bprch = 1;
> +	/* make sure this is OFF for all current chips */
> +	ext_cfg.s.invert_data = 0;
> +
> +	s = lookup_env(priv, "ddr_read_fprch");
> +	if (s)
> +		ext_cfg.s.read_ena_fprch = simple_strtoul(s, NULL, 0);
> +
> +	s = lookup_env(priv, "ddr_read_bprch");
> +	if (s)
> +		ext_cfg.s.read_ena_bprch = simple_strtoul(s, NULL, 0);
> +
> +	s = lookup_env(priv, "ddr_drive_fprch");
> +	if (s)
> +		ext_cfg.s.drive_ena_fprch = simple_strtoul(s, NULL, 0);
> +
> +	s = lookup_env(priv, "ddr_drive_bprch");
> +	if (s)
> +		ext_cfg.s.drive_ena_bprch = simple_strtoul(s, NULL, 0);
> +
> +	if (!octeon_is_cpuid(OCTEON_CN78XX_PASS1_X) && lranks_per_prank > 1) {
> +		ext_cfg.s.dimm0_cid = lranks_bits;
> +		ext_cfg.s.dimm1_cid = lranks_bits;
> +		debug("N%d.LMC%d: 3DS: setting EXT_CONFIG[dimmx_cid] = %d\n",
> +		      node, if_num, ext_cfg.s.dimm0_cid);
> +	}
> +
> +	lmc_wr(priv, CVMX_LMCX_EXT_CONFIG(if_num), ext_cfg.u64);
> +	debug("%-45s : 0x%016llx\n", "EXT_CONFIG", ext_cfg.u64);
> +}
> +
> +/*
> + * Set the DELAY_UNLOAD_R0..R3 fields of LMC(if_num)_EXT_CONFIG2.
> + *
> + * Nothing is done on CN78XX pass 1.x and CN73XX. Elsewhere all four
> + * fields receive the same value: 1 by default, or the boolean value of
> + * the "ddr_ext2_delay_unload" environment variable.
> + */
> +static void lmc_ext_config2(struct ddr_priv *priv)
> +{
> +	union cvmx_lmcx_ext_config2 ext_cfg2;
> +	int unload = 1;	/* default to 1 */
> +	char *env;
> +
> +	/*
> +	 * NOTE: all chips have this register, but not necessarily the
> +	 * fields we modify...
> +	 */
> +	if (octeon_is_cpuid(OCTEON_CN78XX_PASS1_X) ||
> +	    octeon_is_cpuid(OCTEON_CN73XX))
> +		return;
> +
> +	ext_cfg2.u64 = lmc_rd(priv, CVMX_LMCX_EXT_CONFIG2(if_num));
> +
> +	env = lookup_env(priv, "ddr_ext2_delay_unload");
> +	if (env)
> +		unload = !!simple_strtoul(env, NULL, 0);
> +
> +	ext_cfg2.s.delay_unload_r0 = unload;
> +	ext_cfg2.s.delay_unload_r1 = unload;
> +	ext_cfg2.s.delay_unload_r2 = unload;
> +	ext_cfg2.s.delay_unload_r3 = unload;
> +
> +	lmc_wr(priv, CVMX_LMCX_EXT_CONFIG2(if_num), ext_cfg2.u64);
> +	debug("%-45s : 0x%016llx\n", "EXT_CONFIG2", ext_cfg2.u64);
> +}
> +
> +/*
> + * Compute and write the register control words (RC0..RC15, and for DDR4
> + * also RC1x..RCBx) of one registered DIMM.
> + *
> + * The DIMM is selected by the file-scope index 'didx', which the caller
> + * sets before each call. For DDR4 the values are derived from SPD drive
> + * strength bytes and the clock period; for DDR3 they are copied from
> + * SPD bytes 69..76 and may be adjusted with the ddr_*_drive environment
> + * variables. Every control word can additionally be overridden from the
> + * environment. Overrides of individual control words are masked to the
> + * width of the field so that an oversized value cannot spill into the
> + * neighbouring control words.
> + */
> +static void lmc_dimm01_params_loop(struct ddr_priv *priv)
> +{
> +	union cvmx_lmcx_dimmx_params dimm_p;
> +	int dimmx = didx;
> +	char *s;
> +	int rc;
> +	int i;
> +
> +	dimm_p.u64 = lmc_rd(priv, CVMX_LMCX_DIMMX_PARAMS(dimmx, if_num));
> +
> +	if (ddr_type == DDR4_DRAM) {
> +		union cvmx_lmcx_dimmx_ddr4_params0 ddr4_p0;
> +		union cvmx_lmcx_dimmx_ddr4_params1 ddr4_p1;
> +		union cvmx_lmcx_ddr4_dimm_ctl ddr4_ctl;
> +
> +		dimm_p.s.rc0 = 0;
> +		dimm_p.s.rc1 = 0;
> +		dimm_p.s.rc2 = 0;
> +
> +		/* RC3/RC4: the two low 2-bit groups are swapped for RC4 */
> +		rc = read_spd(&dimm_config_table[didx], 0,
> +			      DDR4_SPD_RDIMM_REGISTER_DRIVE_STRENGTH_CTL);
> +		dimm_p.s.rc3 = (rc >> 4) & 0xf;
> +		dimm_p.s.rc4 = ((rc >> 0) & 0x3) << 2;
> +		dimm_p.s.rc4 |= ((rc >> 2) & 0x3) << 0;
> +
> +		/* RC5: same 2-bit group swap, from the CK drive byte */
> +		rc = read_spd(&dimm_config_table[didx], 0,
> +			      DDR4_SPD_RDIMM_REGISTER_DRIVE_STRENGTH_CK);
> +		dimm_p.s.rc5 = ((rc >> 0) & 0x3) << 2;
> +		dimm_p.s.rc5 |= ((rc >> 2) & 0x3) << 0;
> +
> +		dimm_p.s.rc6 = 0;
> +		dimm_p.s.rc7 = 0;
> +		dimm_p.s.rc8 = 0;
> +		dimm_p.s.rc9 = 0;
> +
> +		/*
> +		 * rc10               DDR4 RDIMM Operating Speed
> +		 * ===  ===================================================
> +		 *  0               tclk_psecs >= 1250 psec DDR4-1600 (1250 ps)
> +		 *  1   1250 psec > tclk_psecs >= 1071 psec DDR4-1866 (1071 ps)
> +		 *  2   1071 psec > tclk_psecs >=  938 psec DDR4-2133 ( 938 ps)
> +		 *  3    938 psec > tclk_psecs >=  833 psec DDR4-2400 ( 833 ps)
> +		 *  4    833 psec > tclk_psecs >=  750 psec DDR4-2666 ( 750 ps)
> +		 *  5    750 psec > tclk_psecs >=  625 psec DDR4-3200 ( 625 ps)
> +		 */
> +		dimm_p.s.rc10 = 0;
> +		if (tclk_psecs < 1250)
> +			dimm_p.s.rc10 = 1;
> +		if (tclk_psecs < 1071)
> +			dimm_p.s.rc10 = 2;
> +		if (tclk_psecs < 938)
> +			dimm_p.s.rc10 = 3;
> +		if (tclk_psecs < 833)
> +			dimm_p.s.rc10 = 4;
> +		if (tclk_psecs < 750)
> +			dimm_p.s.rc10 = 5;
> +
> +		dimm_p.s.rc11 = 0;
> +		dimm_p.s.rc12 = 0;
> +		/* bit 2: 0=LRDIMM, 1=RDIMM; bit 3: address mirroring */
> +		dimm_p.s.rc13 = (spd_dimm_type == 4) ? 0 : 4;
> +		dimm_p.s.rc13 |= spd_addr_mirror << 3;
> +		dimm_p.s.rc14 = 0;
> +		dimm_p.s.rc15 = 0;	/* 1 nCK latency adder */
> +
> +		ddr4_p0.u64 = 0;
> +
> +		ddr4_p0.s.rc8x = 0;
> +		ddr4_p0.s.rc7x = 0;
> +		ddr4_p0.s.rc6x = 0;
> +		ddr4_p0.s.rc5x = 0;
> +		ddr4_p0.s.rc4x = 0;
> +
> +		ddr4_p0.s.rc3x = compute_rc3x(tclk_psecs);
> +
> +		ddr4_p0.s.rc2x = 0;
> +		ddr4_p0.s.rc1x = 0;
> +
> +		ddr4_p1.u64 = 0;
> +
> +		ddr4_p1.s.rcbx = 0;
> +		ddr4_p1.s.rcax = 0;
> +		ddr4_p1.s.rc9x = 0;
> +
> +		ddr4_ctl.u64 = 0;
> +		ddr4_ctl.cn70xx.ddr4_dimm0_wmask = 0x004;
> +		ddr4_ctl.cn70xx.ddr4_dimm1_wmask =
> +		    (dimm_count > 1) ? 0x004 : 0x0000;
> +
> +		/*
> +		 * Handle any overrides from envvars here...
> +		 */
> +		s = lookup_env(priv, "ddr_ddr4_params0");
> +		if (s)
> +			ddr4_p0.u64 = simple_strtoul(s, NULL, 0);
> +
> +		s = lookup_env(priv, "ddr_ddr4_params1");
> +		if (s)
> +			ddr4_p1.u64 = simple_strtoul(s, NULL, 0);
> +
> +		s = lookup_env(priv, "ddr_ddr4_dimm_ctl");
> +		if (s)
> +			ddr4_ctl.u64 = simple_strtoul(s, NULL, 0);
> +
> +		/*
> +		 * Per-word overrides ddr_ddr4_rc1x .. ddr_ddr4_rcbx: words
> +		 * 1x..8x live in PARAMS0, 9x..bx in PARAMS1, 8 bits each.
> +		 */
> +		for (i = 0; i < 11; ++i) {
> +			u64 value;
> +
> +			s = lookup_env(priv, "ddr_ddr4_rc%1xx", i + 1);
> +			if (s) {
> +				/* keep the override inside its 8-bit field */
> +				value = simple_strtoul(s, NULL, 0) & 0xff;
> +				if (i < 8) {
> +					ddr4_p0.u64 &= ~((u64)0xff << (i * 8));
> +					ddr4_p0.u64 |= (value << (i * 8));
> +				} else {
> +					ddr4_p1.u64 &=
> +					    ~((u64)0xff << ((i - 8) * 8));
> +					ddr4_p1.u64 |= (value << ((i - 8) * 8));
> +				}
> +			}
> +		}
> +
> +		/*
> +		 * write the final CSR values
> +		 */
> +		lmc_wr(priv, CVMX_LMCX_DIMMX_DDR4_PARAMS0(dimmx, if_num),
> +		       ddr4_p0.u64);
> +
> +		lmc_wr(priv, CVMX_LMCX_DDR4_DIMM_CTL(if_num), ddr4_ctl.u64);
> +
> +		lmc_wr(priv, CVMX_LMCX_DIMMX_DDR4_PARAMS1(dimmx, if_num),
> +		       ddr4_p1.u64);
> +
> +		debug("DIMM%d Register Control Words        RCBx:RC1x : %x %x %x %x %x %x %x %x %x %x %x\n",
> +		      dimmx, ddr4_p1.s.rcbx, ddr4_p1.s.rcax,
> +		      ddr4_p1.s.rc9x, ddr4_p0.s.rc8x,
> +		      ddr4_p0.s.rc7x, ddr4_p0.s.rc6x,
> +		      ddr4_p0.s.rc5x, ddr4_p0.s.rc4x,
> +		      ddr4_p0.s.rc3x, ddr4_p0.s.rc2x, ddr4_p0.s.rc1x);
> +
> +	} else {
> +		/* Each of SPD bytes 69..76 carries two control words */
> +		rc = read_spd(&dimm_config_table[didx], 0, 69);
> +		dimm_p.s.rc0 = (rc >> 0) & 0xf;
> +		dimm_p.s.rc1 = (rc >> 4) & 0xf;
> +
> +		rc = read_spd(&dimm_config_table[didx], 0, 70);
> +		dimm_p.s.rc2 = (rc >> 0) & 0xf;
> +		dimm_p.s.rc3 = (rc >> 4) & 0xf;
> +
> +		rc = read_spd(&dimm_config_table[didx], 0, 71);
> +		dimm_p.s.rc4 = (rc >> 0) & 0xf;
> +		dimm_p.s.rc5 = (rc >> 4) & 0xf;
> +
> +		rc = read_spd(&dimm_config_table[didx], 0, 72);
> +		dimm_p.s.rc6 = (rc >> 0) & 0xf;
> +		dimm_p.s.rc7 = (rc >> 4) & 0xf;
> +
> +		rc = read_spd(&dimm_config_table[didx], 0, 73);
> +		dimm_p.s.rc8 = (rc >> 0) & 0xf;
> +		dimm_p.s.rc9 = (rc >> 4) & 0xf;
> +
> +		rc = read_spd(&dimm_config_table[didx], 0, 74);
> +		dimm_p.s.rc10 = (rc >> 0) & 0xf;
> +		dimm_p.s.rc11 = (rc >> 4) & 0xf;
> +
> +		rc = read_spd(&dimm_config_table[didx], 0, 75);
> +		dimm_p.s.rc12 = (rc >> 0) & 0xf;
> +		dimm_p.s.rc13 = (rc >> 4) & 0xf;
> +
> +		rc = read_spd(&dimm_config_table[didx], 0, 76);
> +		dimm_p.s.rc14 = (rc >> 0) & 0xf;
> +		dimm_p.s.rc15 = (rc >> 4) & 0xf;
> +
> +		s = ddr_getenv_debug(priv, "ddr_clk_drive");
> +		if (s) {
> +			if (strcmp(s, "light") == 0)
> +				dimm_p.s.rc5 = 0x0;	/* Light Drive */
> +			if (strcmp(s, "moderate") == 0)
> +				dimm_p.s.rc5 = 0x5;	/* Moderate Drive */
> +			if (strcmp(s, "strong") == 0)
> +				dimm_p.s.rc5 = 0xA;	/* Strong Drive */
> +			printf("Parameter found in environment. ddr_clk_drive = %s\n",
> +			       s);
> +		}
> +
> +		s = ddr_getenv_debug(priv, "ddr_cmd_drive");
> +		if (s) {
> +			if (strcmp(s, "light") == 0)
> +				dimm_p.s.rc3 = 0x0;	/* Light Drive */
> +			if (strcmp(s, "moderate") == 0)
> +				dimm_p.s.rc3 = 0x5;	/* Moderate Drive */
> +			if (strcmp(s, "strong") == 0)
> +				dimm_p.s.rc3 = 0xA;	/* Strong Drive */
> +			printf("Parameter found in environment. ddr_cmd_drive = %s\n",
> +			       s);
> +		}
> +
> +		/* Only "light" and "moderate" are accepted for RC4 here */
> +		s = ddr_getenv_debug(priv, "ddr_ctl_drive");
> +		if (s) {
> +			if (strcmp(s, "light") == 0)
> +				dimm_p.s.rc4 = 0x0;	/* Light Drive */
> +			if (strcmp(s, "moderate") == 0)
> +				dimm_p.s.rc4 = 0x5;	/* Moderate Drive */
> +			printf("Parameter found in environment. ddr_ctl_drive = %s\n",
> +			       s);
> +		}
> +
> +		/*
> +		 * rc10               DDR3 RDIMM Operating Speed
> +		 * ==   =====================================================
> +		 *  0               tclk_psecs >= 2500 psec DDR3/DDR3L-800 def
> +		 *  1   2500 psec > tclk_psecs >= 1875 psec DDR3/DDR3L-1066
> +		 *  2   1875 psec > tclk_psecs >= 1500 psec DDR3/DDR3L-1333
> +		 *  3   1500 psec > tclk_psecs >= 1250 psec DDR3/DDR3L-1600
> +		 *  4   1250 psec > tclk_psecs >= 1071 psec DDR3-1866
> +		 */
> +		dimm_p.s.rc10 = 0;
> +		if (tclk_psecs < 2500)
> +			dimm_p.s.rc10 = 1;
> +		if (tclk_psecs < 1875)
> +			dimm_p.s.rc10 = 2;
> +		if (tclk_psecs < 1500)
> +			dimm_p.s.rc10 = 3;
> +		if (tclk_psecs < 1250)
> +			dimm_p.s.rc10 = 4;
> +	}
> +
> +	/*
> +	 * Whole-register override. The name contains no conversion
> +	 * specifier, so no extra argument is passed.
> +	 */
> +	s = lookup_env(priv, "ddr_dimmx_params");
> +	if (s)
> +		dimm_p.u64 = simple_strtoul(s, NULL, 0);
> +
> +	/* Per-word overrides ddr_rc0 .. ddr_rc15, 4 bits each */
> +	for (i = 0; i < 16; ++i) {
> +		u64 value;
> +
> +		s = lookup_env(priv, "ddr_rc%d", i);
> +		if (s) {
> +			/* keep the override inside its 4-bit field */
> +			value = simple_strtoul(s, NULL, 0) & 0xf;
> +			dimm_p.u64 &= ~((u64)0xf << (i * 4));
> +			dimm_p.u64 |= (value << (i * 4));
> +		}
> +	}
> +
> +	lmc_wr(priv, CVMX_LMCX_DIMMX_PARAMS(dimmx, if_num), dimm_p.u64);
> +
> +	debug("DIMM%d Register Control Words         RC15:RC0 : %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x %x\n",
> +	      dimmx, dimm_p.s.rc15, dimm_p.s.rc14, dimm_p.s.rc13,
> +	      dimm_p.s.rc12, dimm_p.s.rc11, dimm_p.s.rc10,
> +	      dimm_p.s.rc9, dimm_p.s.rc8, dimm_p.s.rc7,
> +	      dimm_p.s.rc6, dimm_p.s.rc5, dimm_p.s.rc4,
> +	      dimm_p.s.rc3, dimm_p.s.rc2, dimm_p.s.rc1, dimm_p.s.rc0);
> +
> +	/*
> +	 * FIXME: recognize a DDR3 RDIMM with 4 ranks and 2 registers,
> +	 * and treat it specially
> +	 */
> +	if (ddr_type == DDR3_DRAM && num_ranks == 4 &&
> +	    spd_rdimm_registers == 2 && dimmx == 0) {
> +		debug("DDR3: Copying DIMM0_PARAMS to DIMM1_PARAMS for pseudo-DIMM #1...\n");
> +		lmc_wr(priv, CVMX_LMCX_DIMMX_PARAMS(1, if_num), dimm_p.u64);
> +	}
> +}
> +
> +/*
> + * Configure the register control word (RCW) write masks in
> + * LMC(if_num)_DIMM_CTL and, for registered DIMMs, run the RCW init
> + * sequence.
> + *
> + * Registered DIMMs: the control words of every DIMM are computed first
> + * (lmc_dimm01_params_loop(), indexed through the file-scope 'didx').
> + * For DDR4 the RCW sequence is run twice: once with write mask 0xdf3f
> + * and a second time with only bit 13 set and the extended DDR4 mask
> + * cleared. For DDR3 it is run once with all sixteen words enabled.
> + *
> + * Unbuffered DIMMs: both write masks are cleared so that no control
> + * words are written.
> + */
> +static void lmc_dimm01_params(struct ddr_priv *priv)
> +{
> +	union cvmx_lmcx_dimm_ctl dimm_ctl;
> +	char *s;
> +
> +	if (spd_rdimm) {
> +		for (didx = 0; didx < (unsigned int)dimm_count; ++didx)
> +			lmc_dimm01_params_loop(priv);
> +
> +		if (ddr_type == DDR4_DRAM) {
> +			/* LMC0_DIMM_CTL */
> +			dimm_ctl.u64 = lmc_rd(priv, CVMX_LMCX_DIMM_CTL(if_num));
> +			dimm_ctl.s.dimm0_wmask = 0xdf3f;
> +			dimm_ctl.s.dimm1_wmask =
> +			    (dimm_count > 1) ? 0xdf3f : 0x0000;
> +			dimm_ctl.s.tcws = 0x4e0;
> +			dimm_ctl.s.parity = c_cfg->parity;
> +
> +			s = lookup_env(priv, "ddr_dimm0_wmask");
> +			if (s) {
> +				dimm_ctl.s.dimm0_wmask =
> +				    simple_strtoul(s, NULL, 0);
> +			}
> +
> +			s = lookup_env(priv, "ddr_dimm1_wmask");
> +			if (s) {
> +				dimm_ctl.s.dimm1_wmask =
> +				    simple_strtoul(s, NULL, 0);
> +			}
> +
> +			s = lookup_env(priv, "ddr_dimm_ctl_parity");
> +			if (s)
> +				dimm_ctl.s.parity = simple_strtoul(s, NULL, 0);
> +
> +			s = lookup_env(priv, "ddr_dimm_ctl_tcws");
> +			if (s)
> +				dimm_ctl.s.tcws = simple_strtoul(s, NULL, 0);
> +
> +			debug("LMC DIMM_CTL                                  : 0x%016llx\n",
> +			      dimm_ctl.u64);
> +			lmc_wr(priv, CVMX_LMCX_DIMM_CTL(if_num), dimm_ctl.u64);
> +
> +			/* Init RCW */
> +			oct3_ddr3_seq(priv, rank_mask, if_num, 0x7);
> +
> +			/* Write RC0D last */
> +			dimm_ctl.s.dimm0_wmask = 0x2000;
> +			dimm_ctl.s.dimm1_wmask = (dimm_count > 1) ?
> +				0x2000 : 0x0000;
> +			debug("LMC DIMM_CTL                                  : 0x%016llx\n",
> +			      dimm_ctl.u64);
> +			lmc_wr(priv, CVMX_LMCX_DIMM_CTL(if_num), dimm_ctl.u64);
> +
> +			/*
> +			 * Don't write any extended registers the second time
> +			 */
> +			lmc_wr(priv, CVMX_LMCX_DDR4_DIMM_CTL(if_num), 0);
> +
> +			/* Init RCW */
> +			oct3_ddr3_seq(priv, rank_mask, if_num, 0x7);
> +		} else {
> +			/* LMC0_DIMM_CTL */
> +			dimm_ctl.u64 = lmc_rd(priv, CVMX_LMCX_DIMM_CTL(if_num));
> +			dimm_ctl.s.dimm0_wmask = 0xffff;
> +			/*
> +			 * FIXME: recognize a DDR3 RDIMM with 4 ranks and 2
> +			 * registers, and treat it specially
> +			 */
> +			if (num_ranks == 4 && spd_rdimm_registers == 2) {
> +				debug("DDR3: Activating DIMM_CTL[dimm1_mask] bits...\n");
> +				dimm_ctl.s.dimm1_wmask = 0xffff;
> +			} else {
> +				dimm_ctl.s.dimm1_wmask =
> +				    (dimm_count > 1) ? 0xffff : 0x0000;
> +			}
> +			dimm_ctl.s.tcws = 0x4e0;
> +			dimm_ctl.s.parity = c_cfg->parity;
> +
> +			s = lookup_env(priv, "ddr_dimm0_wmask");
> +			if (s) {
> +				dimm_ctl.s.dimm0_wmask =
> +				    simple_strtoul(s, NULL, 0);
> +			}
> +
> +			s = lookup_env(priv, "ddr_dimm1_wmask");
> +			if (s) {
> +				dimm_ctl.s.dimm1_wmask =
> +				    simple_strtoul(s, NULL, 0);
> +			}
> +
> +			s = lookup_env(priv, "ddr_dimm_ctl_parity");
> +			if (s)
> +				dimm_ctl.s.parity = simple_strtoul(s, NULL, 0);
> +
> +			s = lookup_env(priv, "ddr_dimm_ctl_tcws");
> +			if (s)
> +				dimm_ctl.s.tcws = simple_strtoul(s, NULL, 0);
> +
> +			debug("LMC DIMM_CTL                                  : 0x%016llx\n",
> +			      dimm_ctl.u64);
> +			lmc_wr(priv, CVMX_LMCX_DIMM_CTL(if_num), dimm_ctl.u64);
> +
> +			/* Init RCW */
> +			oct3_ddr3_seq(priv, rank_mask, if_num, 0x7);
> +		}
> +
> +	} else {
> +		/*
> +		 * Disable register control writes for unbuffered. The
> +		 * function-level dimm_ctl is reused here instead of
> +		 * declaring a second variable that shadows it.
> +		 */
> +		dimm_ctl.u64 = lmc_rd(priv, CVMX_LMCX_DIMM_CTL(if_num));
> +		dimm_ctl.s.dimm0_wmask = 0;
> +		dimm_ctl.s.dimm1_wmask = 0;
> +		lmc_wr(priv, CVMX_LMCX_DIMM_CTL(if_num), dimm_ctl.u64);
> +	}
> +}
> +
> +/*
> + * Run the DRAM init sequence for the current interface followed by
> + * offset training, internal VREF (DAC) training and deskew training.
> + *
> + * Internal VREF training may take several samples per lane that are
> + * averaged afterwards, and individual lanes can be overridden from the
> + * environment. If deskew training still reports errors, internal VREF
> + * training is repeated up to DEFAULT_INTERNAL_VREF_TRAINING_LIMIT
> + * times.
> + *
> + * When built with ALLOW_BY_RANK_INIT and enable_by_rank_init is set,
> + * the whole procedure is repeated per rank (by_rank 3 down to 0) and
> + * the per-rank DAC and deskew results are post-processed at the end.
> + *
> + * Return: 0 on completion, -EAGAIN if deskew training is still
> + * incomplete after all retries and restart_if_dsk_incomplete is set.
> + */
> +static int lmc_rank_init(struct ddr_priv *priv)
> +{
> +	char *s;
> +
> +#if ALLOW_BY_RANK_INIT
> +	if (enable_by_rank_init) {
> +		by_rank = 3;
> +		saved_rank_mask = rank_mask;
> +	}
> +
> +start_by_rank_init:
> +
> +	if (enable_by_rank_init) {
> +		/* Skip ranks that are not populated */
> +		rank_mask = (1 << by_rank);
> +		if (!(rank_mask & saved_rank_mask))
> +			goto end_by_rank_init;
> +		/* The final pass (rank 0) runs with the full rank mask */
> +		if (by_rank == 0)
> +			rank_mask = saved_rank_mask;
> +
> +		debug("\n>>>>> BY_RANK: starting rank %d with mask 0x%02x\n\n",
> +		      by_rank, rank_mask);
> +	}
> +
> +#endif /* ALLOW_BY_RANK_INIT */
> +
> +	/*
> +	 * Comments (steps 3 through 5) continue in oct3_ddr3_seq()
> +	 */
> +	union cvmx_lmcx_modereg_params0 mp0;
> +
> +	if (ddr_memory_preserved(priv)) {
> +		/*
> +		 * Contents are being preserved. Take DRAM out of self-refresh
> +		 * first. Then init steps can proceed normally
> +		 */
> +		/* self-refresh exit */
> +		oct3_ddr3_seq(priv, rank_mask, if_num, 3);
> +	}
> +
> +	mp0.u64 = lmc_rd(priv, CVMX_LMCX_MODEREG_PARAMS0(if_num));
> +	mp0.s.dllr = 1;		/* Set during first init sequence */
> +	lmc_wr(priv, CVMX_LMCX_MODEREG_PARAMS0(if_num), mp0.u64);
> +
> +	ddr_init_seq(priv, rank_mask, if_num);
> +
> +	mp0.s.dllr = 0;		/* Clear for normal operation */
> +	lmc_wr(priv, CVMX_LMCX_MODEREG_PARAMS0(if_num), mp0.u64);
> +
> +	if (spd_rdimm && ddr_type == DDR4_DRAM &&
> +	    octeon_is_cpuid(OCTEON_CN7XXX)) {
> +		debug("Running init sequence 1\n");
> +		change_rdimm_mpr_pattern(priv, rank_mask, if_num, dimm_count);
> +	}
> +
> +	memset(lanes, 0, sizeof(lanes));
> +	for (lane = 0; lane < last_lane; lane++) {
> +		/* init all lanes to reset value */
> +		dac_settings[lane] = 127;
> +	}
> +
> +	/* FIXME: disable internal VREF if deskew is disabled? */
> +	if (disable_deskew_training) {
> +		debug("N%d.LMC%d: internal VREF Training disabled, leaving them in RESET.\n",
> +		      node, if_num);
> +		num_samples = 0;
> +	} else if (ddr_type == DDR4_DRAM &&
> +		   !octeon_is_cpuid(OCTEON_CN78XX_PASS1_X)) {
> +		num_samples = DEFAULT_DAC_SAMPLES;
> +	} else {
> +		/* if DDR3 or no ability to write DAC values */
> +		num_samples = 1;
> +	}
> +
> +	/* Re-entered from below when deskew training reports errors */
> +perform_internal_vref_training:
> +
> +	total_dac_eval_retries = 0;
> +	dac_eval_exhausted = 0;
> +
> +	for (sample = 0; sample < num_samples; sample++) {
> +		dac_eval_retries = 0;
> +
> +		/* make offset and internal vref training repeatable */
> +		do {
> +			/*
> +			 * 6.9.8 LMC Offset Training
> +			 * LMC requires input-receiver offset training.
> +			 */
> +			perform_offset_training(priv, rank_mask, if_num);
> +
> +			/*
> +			 * 6.9.9 LMC Internal vref Training
> +			 * LMC requires input-reference-voltage training.
> +			 */
> +			perform_internal_vref_training(priv, rank_mask, if_num);
> +
> +			/* read and maybe display the DAC values for a sample */
> +			read_dac_dbi_settings(priv, if_num, /*DAC*/ 1,
> +					      dac_settings);
> +			if (num_samples == 1 || ddr_verbose(priv)) {
> +				display_dac_dbi_settings(if_num, /*DAC*/ 1,
> +							 use_ecc, dac_settings,
> +							 "Internal VREF");
> +			}
> +
> +			/*
> +			 * for DDR4, evaluate the DAC settings and retry
> +			 * if any issues
> +			 */
> +			if (ddr_type == DDR4_DRAM) {
> +				if (evaluate_dac_settings
> +				    (if_64b, use_ecc, dac_settings)) {
> +					dac_eval_retries += 1;
> +					if (dac_eval_retries >
> +					    DAC_RETRIES_LIMIT) {
> +						debug("N%d.LMC%d: DDR4 internal VREF DAC settings: retries exhausted; continuing...\n",
> +						      node, if_num);
> +						dac_eval_exhausted += 1;
> +					} else {
> +						debug("N%d.LMC%d: DDR4 internal VREF DAC settings inconsistent; retrying....\n",
> +						      node, if_num);
> +						total_dac_eval_retries += 1;
> +						/*
> +						 * try another sample: the
> +						 * do/while(1) body is run
> +						 * again for this sample
> +						 */
> +						continue;
> +					}
> +				}
> +
> +				/* taking multiple samples, otherwise do nothing */
> +				if (num_samples > 1) {
> +					/*
> +					 * good sample or exhausted retries,
> +					 * record it
> +					 */
> +					for (lane = 0; lane < last_lane;
> +					     lane++) {
> +						lanes[lane].bytes[sample] =
> +						    dac_settings[lane];
> +					}
> +				}
> +			}
> +			/* done if DDR3, or good sample, or exhausted retries */
> +			break;
> +		} while (1);
> +	}
> +
> +	if (ddr_type == DDR4_DRAM && dac_eval_exhausted > 0) {
> +		debug("N%d.LMC%d: DDR internal VREF DAC settings: total retries %d, exhausted %d\n",
> +		      node, if_num, total_dac_eval_retries, dac_eval_exhausted);
> +	}
> +
> +	if (num_samples > 1) {
> +		debug("N%d.LMC%d: DDR4 internal VREF DAC settings: processing multiple samples...\n",
> +		      node, if_num);
> +
> +		for (lane = 0; lane < last_lane; lane++) {
> +			dac_settings[lane] =
> +			    process_samples_average(&lanes[lane].bytes[0],
> +						    num_samples, if_num, lane);
> +		}
> +		display_dac_dbi_settings(if_num, /*DAC*/ 1, use_ecc,
> +					 dac_settings, "Averaged VREF");
> +
> +		/* finally, write the final DAC values */
> +		for (lane = 0; lane < last_lane; lane++) {
> +			load_dac_override(priv, if_num, dac_settings[lane],
> +					  lane);
> +		}
> +	}
> +
> +	/* allow override of any byte-lane internal VREF */
> +	int overrode_vref_dac = 0;
> +
> +	for (lane = 0; lane < last_lane; lane++) {
> +		s = lookup_env(priv, "ddr%d_vref_dac_byte%d", if_num, lane);
> +		if (s) {
> +			dac_settings[lane] = simple_strtoul(s, NULL, 0);
> +			overrode_vref_dac = 1;
> +			/* finally, write the new DAC value */
> +			load_dac_override(priv, if_num, dac_settings[lane],
> +					  lane);
> +		}
> +	}
> +	if (overrode_vref_dac) {
> +		display_dac_dbi_settings(if_num, /*DAC*/ 1, use_ecc,
> +					 dac_settings, "Override VREF");
> +	}
> +
> +#if DDR3_DAC_OVERRIDE
> +	/*
> +	 * as a second step, after internal VREF training, before starting
> +	 * deskew training:
> +	 * for DDR3 and OCTEON3 not O78 pass 1.x, override the DAC setting
> +	 * to 127
> +	 */
> +	if (ddr_type == DDR3_DRAM && !octeon_is_cpuid(OCTEON_CN78XX_PASS1_X) &&
> +	    !disable_deskew_training) {
> +		load_dac_override(priv, if_num, 127, /* all */ 0x0A);
> +		debug("N%d.LMC%d: Overriding DDR3 internal VREF DAC settings to 127.\n",
> +		      node, if_num);
> +	}
> +#endif
> +
> +	/*
> +	 * 4.8.8 LMC Deskew Training
> +	 *
> +	 * LMC requires input-read-data deskew training.
> +	 */
> +	if (!disable_deskew_training) {
> +		deskew_training_errors =
> +		    perform_deskew_training(priv, rank_mask, if_num,
> +					    spd_rawcard_aorb);
> +
> +		/*
> +		 * All the Deskew lock and saturation retries (may) have
> +		 * been done, but we ended up with nibble errors; so,
> +		 * as a last ditch effort, try the Internal vref
> +		 * Training again...
> +		 */
> +		if (deskew_training_errors) {
> +			if (internal_retries <
> +			    DEFAULT_INTERNAL_VREF_TRAINING_LIMIT) {
> +				internal_retries++;
> +				debug("N%d.LMC%d: Deskew training results still unsettled - retrying internal vref training (%d)\n",
> +				      node, if_num, internal_retries);
> +				goto perform_internal_vref_training;
> +			} else {
> +				if (restart_if_dsk_incomplete) {
> +					debug("N%d.LMC%d: INFO: Deskew training incomplete - %d retries exhausted, Restarting LMC init...\n",
> +					      node, if_num, internal_retries);
> +					return -EAGAIN;
> +				}
> +				debug("N%d.LMC%d: Deskew training incomplete - %d retries exhausted, but continuing...\n",
> +				      node, if_num, internal_retries);
> +			}
> +		}		/* if (deskew_training_errors) */
> +
> +		/*
> +		 * FIXME: treat this as the final DSK print from now on,
> +		 * and print if VBL_NORM or above also, save the results
> +		 * of the original training in case we want them later
> +		 */
> +		validate_deskew_training(priv, rank_mask, if_num,
> +					 &deskew_training_results, 1);
> +	} else {		/* if (! disable_deskew_training) */
> +		debug("N%d.LMC%d: Deskew Training disabled, printing settings before HWL.\n",
> +		      node, if_num);
> +		validate_deskew_training(priv, rank_mask, if_num,
> +					 &deskew_training_results, 1);
> +	}			/* if (! disable_deskew_training) */
> +
> +#if ALLOW_BY_RANK_INIT
> +
> +	if (enable_by_rank_init) {
> +		/* Capture this rank's DAC and deskew results */
> +		read_dac_dbi_settings(priv, if_num, /*dac */ 1,
> +				      &rank_dac[by_rank].bytes[0]);
> +		get_deskew_settings(priv, if_num, &rank_dsk[by_rank]);
> +		debug("\n>>>>> BY_RANK: ending rank %d\n\n", by_rank);
> +	}
> +
> +end_by_rank_init:
> +
> +	if (enable_by_rank_init) {
> +		//debug("\n>>>>> BY_RANK: ending rank %d\n\n", by_rank);
> +
> +		/* Move on to the next lower rank until rank 0 is done */
> +		by_rank--;
> +		if (by_rank >= 0)
> +			goto start_by_rank_init;
> +
> +		rank_mask = saved_rank_mask;
> +		ddr_init_seq(priv, rank_mask, if_num);
> +
> +		process_by_rank_dac(priv, if_num, rank_mask, rank_dac);
> +		process_by_rank_dsk(priv, if_num, rank_mask, rank_dsk);
> +
> +		/* FIXME: set this to prevent later checking!!! */
> +		disable_deskew_training = 1;
> +
> +		debug("\n>>>>> BY_RANK: FINISHED!!\n\n");
> +	}
> +#endif /* ALLOW_BY_RANK_INIT */
> +
> +	return 0;
> +}
> +
> +/*
> + * Let ZQCS calibration settle before read/write leveling.
> + *
> + * LMC(if_num)_CONFIG[REF_ZQCS_INT] is temporarily switched to the
> + * smallest interval, the function waits for a computed time, and the
> + * previously programmed interval is written back afterwards.
> + */
> +static void lmc_config_2(struct ddr_priv *priv)
> +{
> +	union cvmx_lmcx_config cfg;
> +	u64 pending_usecs;
> +	u64 settle_usecs;
> +	u64 wait_usecs;
> +	int saved_interval;
> +
> +	cfg.u64 = lmc_rd(priv, CVMX_LMCX_CONFIG(if_num));
> +
> +	/*
> +	 * Temporarily select the minimum ZQCS interval and wait
> +	 * long enough for a few ZQCS calibrations to occur. This
> +	 * should ensure that the calibration circuitry is
> +	 * stabilized before read/write leveling occurs.
> +	 */
> +	if (octeon_is_cpuid(OCTEON_CN7XXX)) {
> +		saved_interval = cfg.cn78xx.ref_zqcs_int;
> +		/* set smallest interval */
> +		cfg.cn78xx.ref_zqcs_int = 1 | (32 << 7);
> +	} else {
> +		saved_interval = cfg.cn63xx.ref_zqcs_int;
> +		/* set smallest interval */
> +		cfg.cn63xx.ref_zqcs_int = 1 | (32 << 7);
> +	}
> +	lmc_wr(priv, CVMX_LMCX_CONFIG(if_num), cfg.u64);
> +	lmc_rd(priv, CVMX_LMCX_CONFIG(if_num));
> +
> +	/*
> +	 * Compute an appropriate delay based on the current ZQCS
> +	 * interval. The delay should be long enough for the
> +	 * current ZQCS delay counter to expire plus ten of the
> +	 * minimum intervals to ensure that some calibrations
> +	 * occur.
> +	 */
> +	pending_usecs = (((u64)saved_interval >> 7) * tclk_psecs *
> +			 100 * 512 * 128) / (10000 * 10000);
> +	settle_usecs = 10 * ((u64)32 * tclk_psecs * 100 * 512 * 128) /
> +		(10000 * 10000);
> +	wait_usecs = pending_usecs + settle_usecs;
> +
> +	debug("Waiting %lld usecs for ZQCS calibrations to start\n",
> +	      wait_usecs);
> +	udelay(wait_usecs);
> +
> +	/* Restore computed interval */
> +	if (octeon_is_cpuid(OCTEON_CN7XXX))
> +		cfg.cn78xx.ref_zqcs_int = saved_interval;
> +	else
> +		cfg.cn63xx.ref_zqcs_int = saved_interval;
> +
> +	lmc_wr(priv, CVMX_LMCX_CONFIG(if_num), cfg.u64);
> +	lmc_rd(priv, CVMX_LMCX_CONFIG(if_num));
> +}
> +
> +/*
> + * File-scope state used by the write-leveling code that follows.
> + *
> + * Every object is explicitly placed in the ".data" section.
> + * NOTE(review): presumably because this code runs before relocation,
> + * when zero-initialised (.bss) storage is not usable yet - confirm.
> + */
> +static union cvmx_lmcx_wlevel_ctl wl_ctl __section(".data");
> +static union cvmx_lmcx_wlevel_rankx wl_rank __section(".data");
> +static union cvmx_lmcx_modereg_params1 mp1 __section(".data");
> +
> +/* Per byte-lane write-leveling bitmasks (8 data lanes + 1 ECC lane) */
> +static int wl_mask[9] __section(".data");
> +static int byte_idx __section(".data");
> +static int ecc_ena __section(".data");
> +static int wl_roundup __section(".data");
> +static int save_mode32b __section(".data");
> +static int disable_hwl_validity __section(".data");
> +static int default_wl_rtt_nom __section(".data");
> +static int wl_pbm_pump __section(".data");
> +
> +/*
> + * lmc_write_leveling_loop() - run hardware write-leveling for one rank
> + *
> + * @priv:	driver private data, handed to the LMC register accessors
> + * @rankx:	rank to level; returns immediately if its bit is clear in
> + *		rank_mask
> + *
> + * Collects wl_loops samples. For each sample the function clears
> + * LMCX_WLEVEL_RANKX, programs LMCX_WLEVEL_CTL (rtt_nom, all nine lanes),
> + * starts sequence 6 through oct3_ddr3_seq() and polls until the rank
> + * status reads 3. The per-lane bitmasks are then read back. A sample with
> + * an empty bitmask or a validity error is retried up to
> + * WLOOP_RETRIES_DEFAULT times without being counted; after that it is
> + * accepted as is. With HW_WL_MAJORITY and more than one sample, a
> + * per-lane majority vote is written back to LMCX_WLEVEL_RANKX.
> + *
> + * Byte lanes whose bit is clear in if_bytemask are skipped, not treated
> + * as a reason to abandon the rank.
> + */
> +static void lmc_write_leveling_loop(struct ddr_priv *priv, int rankx)
> +{
> +	int wloop = 0;
> +	// retries per sample for HW-related issues with bitmasks or values
> +	int wloop_retries = 0;
> +	int wloop_retries_total = 0;
> +	int wloop_retries_exhausted = 0;
> +#define WLOOP_RETRIES_DEFAULT 5
> +	int wl_val_err;
> +	int wl_mask_err_rank = 0;
> +	int wl_val_err_rank = 0;
> +#if HW_WL_MAJORITY
> +	// array to collect counts of byte-lane values
> +	// assume low-order 3 bits and even, so really only 2-bit values
> +	struct wlevel_bitcnt wl_bytes[9], wl_bytes_extra[9];
> +	int extra_bumps, extra_mask;
> +#endif
> +	int rank_nom = 0;
> +
> +	/* nothing to do for a rank that is not populated */
> +	if (!(rank_mask & (1 << rankx)))
> +		return;
> +
> +	if (match_wl_rtt_nom) {
> +		if (rankx == 0)
> +			rank_nom = mp1.s.rtt_nom_00;
> +		if (rankx == 1)
> +			rank_nom = mp1.s.rtt_nom_01;
> +		if (rankx == 2)
> +			rank_nom = mp1.s.rtt_nom_10;
> +		if (rankx == 3)
> +			rank_nom = mp1.s.rtt_nom_11;
> +
> +		debug("N%d.LMC%d.R%d: Setting WLEVEL_CTL[rtt_nom] to %d (%d)\n",
> +		      node, if_num, rankx, rank_nom,
> +		      imp_val->rtt_nom_ohms[rank_nom]);
> +	}
> +
> +#if HW_WL_MAJORITY
> +	memset(wl_bytes, 0, sizeof(wl_bytes));
> +	memset(wl_bytes_extra, 0, sizeof(wl_bytes_extra));
> +#endif
> +
> +	// restructure the looping so we can keep trying until we get the
> +	// samples we want
> +	while (wloop < wl_loops) {
> +		wl_ctl.u64 = lmc_rd(priv, CVMX_LMCX_WLEVEL_CTL(if_num));
> +
> +		wl_ctl.cn78xx.rtt_nom =
> +		    (default_wl_rtt_nom > 0) ? (default_wl_rtt_nom - 1) : 7;
> +
> +		if (match_wl_rtt_nom) {
> +			wl_ctl.cn78xx.rtt_nom =
> +			    (rank_nom > 0) ? (rank_nom - 1) : 7;
> +		}
> +
> +		/* Clear write-level delays */
> +		lmc_wr(priv, CVMX_LMCX_WLEVEL_RANKX(rankx, if_num), 0);
> +
> +		wl_mask_err = 0;	/* Reset error counters */
> +		wl_val_err = 0;
> +
> +		for (byte_idx = 0; byte_idx < 9; ++byte_idx)
> +			wl_mask[byte_idx] = 0;	/* Reset bitmasks */
> +
> +		// do all the byte-lanes at the same time
> +		wl_ctl.cn78xx.lanemask = 0x1ff;
> +
> +		lmc_wr(priv, CVMX_LMCX_WLEVEL_CTL(if_num), wl_ctl.u64);
> +
> +		/*
> +		 * Read and write values back in order to update the
> +		 * status field. This ensures that we read the updated
> +		 * values after write-leveling has completed.
> +		 */
> +		lmc_wr(priv, CVMX_LMCX_WLEVEL_RANKX(rankx, if_num),
> +		       lmc_rd(priv, CVMX_LMCX_WLEVEL_RANKX(rankx, if_num)));
> +
> +		/* write-leveling */
> +		oct3_ddr3_seq(priv, 1 << rankx, if_num, 6);
> +
> +		/* poll until the rank reports status 3 */
> +		do {
> +			wl_rank.u64 = lmc_rd(priv,
> +					     CVMX_LMCX_WLEVEL_RANKX(rankx,
> +								    if_num));
> +		} while (wl_rank.cn78xx.status != 3);
> +
> +		wl_rank.u64 = lmc_rd(priv, CVMX_LMCX_WLEVEL_RANKX(rankx,
> +								  if_num));
> +
> +		/* an all-zero bitmask on any lane counts as a bitmask error */
> +		for (byte_idx = 0; byte_idx < (8 + ecc_ena); ++byte_idx) {
> +			wl_mask[byte_idx] = lmc_ddr3_wl_dbg_read(priv,
> +								 if_num,
> +								 byte_idx);
> +			if (wl_mask[byte_idx] == 0)
> +				++wl_mask_err;
> +		}
> +
> +		// check validity only if no bitmask errors
> +		if (wl_mask_err == 0) {
> +			if ((spd_dimm_type == 1 || spd_dimm_type == 2) &&
> +			    dram_width != 16 && if_64b &&
> +			    !disable_hwl_validity) {
> +				// bypass if [mini|SO]-[RU]DIMM or x16 or
> +				// 32-bit
> +				wl_val_err =
> +				    validate_hw_wl_settings(if_num,
> +							    &wl_rank,
> +							    spd_rdimm, ecc_ena);
> +				wl_val_err_rank += (wl_val_err != 0);
> +			}
> +		} else {
> +			wl_mask_err_rank++;
> +		}
> +
> +		// before we print, if we had bitmask or validity errors,
> +		// do a retry...
> +		if (wl_mask_err != 0 || wl_val_err != 0) {
> +			if (wloop_retries < WLOOP_RETRIES_DEFAULT) {
> +				wloop_retries++;
> +				wloop_retries_total++;
> +				// this printout is per-retry: only when VBL
> +				// is high enough (DEV?)
> +				// FIXME: do we want to show the bad bitmaps
> +				// or delays here also?
> +				debug("N%d.LMC%d.R%d: H/W Write-Leveling had %s errors - retrying...\n",
> +				      node, if_num, rankx,
> +				      (wl_mask_err) ? "Bitmask" : "Validity");
> +				/*
> +				 * Go back to the top of the sampling loop
> +				 * without counting this sample. This must be
> +				 * "continue": a "return" here would give up
> +				 * on the rank after the first bad sample and
> +				 * make the retry limit meaningless.
> +				 */
> +				continue;
> +			}
> +
> +			// retries exhausted, do not print at normal VBL
> +			debug("N%d.LMC%d.R%d: H/W Write-Leveling issues: %s errors\n",
> +			      node, if_num, rankx,
> +			      (wl_mask_err) ? "Bitmask" : "Validity");
> +			wloop_retries_exhausted++;
> +		}
> +		// no errors or exhausted retries, use this sample
> +		wloop_retries = 0;	//reset for next sample
> +
> +		// when only 1 sample or forced, print the bitmasks then
> +		// current HW WL
> +		if (wl_loops == 1 || wl_print) {
> +			if (wl_print > 1)
> +				display_wl_bm(if_num, rankx, wl_mask);
> +			display_wl(if_num, wl_rank, rankx);
> +		}
> +
> +		if (wl_roundup) {	/* Round up odd bitmask delays */
> +			for (byte_idx = 0; byte_idx < (8 + ecc_ena);
> +			     ++byte_idx) {
> +				/* skip unused lanes, keep processing the rest */
> +				if (!(if_bytemask & (1 << byte_idx)))
> +					continue;
> +				upd_wl_rank(&wl_rank, byte_idx,
> +					    roundup_ddr3_wlevel_bitmask
> +					    (wl_mask[byte_idx]));
> +			}
> +			lmc_wr(priv, CVMX_LMCX_WLEVEL_RANKX(rankx, if_num),
> +			       wl_rank.u64);
> +			display_wl(if_num, wl_rank, rankx);
> +		}
> +
> +#if HW_WL_MAJORITY
> +		// OK, we have a decent sample, no bitmask or validity errors
> +		extra_bumps = 0;
> +		extra_mask = 0;
> +		for (byte_idx = 0; byte_idx < (8 + ecc_ena); ++byte_idx) {
> +			int ix;
> +
> +			/* skip unused lanes, keep processing the rest */
> +			if (!(if_bytemask & (1 << byte_idx)))
> +				continue;
> +
> +			// increment count of byte-lane value
> +			// only 4 values
> +			ix = (get_wl_rank(&wl_rank, byte_idx) >> 1) & 3;
> +			wl_bytes[byte_idx].bitcnt[ix]++;
> +			wl_bytes_extra[byte_idx].bitcnt[ix]++;
> +			// if perfect...
> +			if (__builtin_popcount(wl_mask[byte_idx]) == 4) {
> +				wl_bytes_extra[byte_idx].bitcnt[ix] +=
> +				    wl_pbm_pump;
> +				extra_bumps++;
> +				extra_mask |= 1 << byte_idx;
> +			}
> +		}
> +
> +		if (extra_bumps) {
> +			if (wl_print > 1) {
> +				debug("N%d.LMC%d.R%d: HWL sample had %d bumps (0x%02x).\n",
> +				      node, if_num, rankx, extra_bumps,
> +				      extra_mask);
> +			}
> +		}
> +#endif
> +
> +		// if we get here, we have taken a decent sample
> +		wloop++;
> +
> +	}			/* while (wloop < wl_loops) */
> +
> +#if HW_WL_MAJORITY
> +	// if we did sample more than once, try to pick a majority vote
> +	if (wl_loops > 1) {
> +		// look for the majority in each byte-lane
> +		for (byte_idx = 0; byte_idx < (8 + ecc_ena); ++byte_idx) {
> +			int mx, mc, xc, cc;
> +			int ix, alts;
> +			int maj, xmaj, xmx, xmc, xxc, xcc;
> +
> +			/*
> +			 * Skip unused lanes. Returning here would drop the
> +			 * WLEVEL_RANKX write-back below for the used lanes.
> +			 */
> +			if (!(if_bytemask & (1 << byte_idx)))
> +				continue;
> +			maj = find_wl_majority(&wl_bytes[byte_idx], &mx,
> +					       &mc, &xc, &cc);
> +			xmaj = find_wl_majority(&wl_bytes_extra[byte_idx],
> +						&xmx, &xmc, &xxc, &xcc);
> +			if (maj != xmaj) {
> +				if (wl_print) {
> +					debug("N%d.LMC%d.R%d: Byte %d: HWL maj %d(%d), USING xmaj %d(%d)\n",
> +					      node, if_num, rankx,
> +					      byte_idx, maj, xc, xmaj, xxc);
> +				}
> +				mx = xmx;
> +				mc = xmc;
> +				xc = xxc;
> +				cc = xcc;
> +			}
> +
> +#if SWL_TRY_HWL_ALT
> +			// see if there was an alternate
> +			// take out the majority choice
> +			alts = (mc & ~(1 << mx));
> +			if (alts != 0) {
> +				for (ix = 0; ix < 4; ix++) {
> +					// FIXME: could be done multiple times?
> +					// bad if so
> +					if (alts & (1 << ix)) {
> +						// set the mask
> +						hwl_alts[rankx].hwl_alt_mask |=
> +							(1 << byte_idx);
> +						// record the value
> +						hwl_alts[rankx].hwl_alt_delay[byte_idx] =
> +							ix << 1;
> +						if (wl_print > 1) {
> +							debug("N%d.LMC%d.R%d: SWL_TRY_HWL_ALT: Byte %d maj %d (%d) alt %d (%d).\n",
> +							      node,
> +							      if_num,
> +							      rankx,
> +							      byte_idx,
> +							      mx << 1,
> +							      xc,
> +							      ix << 1,
> +							      wl_bytes
> +							      [byte_idx].bitcnt
> +							      [ix]);
> +						}
> +					}
> +				}
> +			}
> +#endif /* SWL_TRY_HWL_ALT */
> +
> +			if (cc > 2) {	// unlikely, but...
> +				// assume: counts for 3 indices are all 1
> +				// possibilities are: 0/2/4, 2/4/6, 0/4/6, 0/2/6
> +				// and the desired?:   2  ,   4  ,     6, 0
> +				// we choose the middle, assuming one of the
> +				// outliers is bad
> +				// NOTE: this is an ugly hack at the moment;
> +				// there must be a better way
> +				switch (mc) {
> +				case 0x7:
> +					mx = 1;
> +					break;	// was 0/2/4, choose 2
> +				case 0xb:
> +					mx = 0;
> +					break;	// was 0/2/6, choose 0
> +				case 0xd:
> +					mx = 3;
> +					break;	// was 0/4/6, choose 6
> +				case 0xe:
> +					mx = 2;
> +					break;	// was 2/4/6, choose 4
> +				default:
> +				case 0xf:
> +					mx = 1;
> +					break;	// was 0/2/4/6, choose 2?
> +				}
> +				printf("N%d.LMC%d.R%d: HW WL MAJORITY: bad byte-lane %d (0x%x), using %d.\n",
> +				       node, if_num, rankx, byte_idx, mc,
> +				       mx << 1);
> +			}
> +			upd_wl_rank(&wl_rank, byte_idx, mx << 1);
> +		}
> +
> +		lmc_wr(priv, CVMX_LMCX_WLEVEL_RANKX(rankx, if_num),
> +		       wl_rank.u64);
> +		display_wl_with_final(if_num, wl_rank, rankx);
> +
> +		// FIXME: does this help make the output a little easier
> +		// to focus?
> +		if (wl_print > 0)
> +			debug("-----------\n");
> +
> +	}			/* if (wl_loops > 1) */
> +#endif /* HW_WL_MAJORITY */
> +
> +	// maybe print an error summary for the rank
> +	if (wl_mask_err_rank != 0 || wl_val_err_rank != 0) {
> +		debug("N%d.LMC%d.R%d: H/W Write-Leveling errors - %d bitmask, %d validity, %d retries, %d exhausted\n",
> +		      node, if_num, rankx, wl_mask_err_rank,
> +		      wl_val_err_rank, wloop_retries_total,
> +		      wloop_retries_exhausted);
> +	}
> +}
> +
> +/*
> + * lmc_write_leveling() - hardware write-leveling for all ranks of one LMC
> + *
> + * @priv:	driver private data, handed to the LMC register accessors and
> + *		to lookup_env()
> + *
> + * Sets up the write-leveling defaults, applies the optional environment
> + * overrides, temporarily forces LMCX_CONFIG[mode32b] to !if_64b, runs
> + * lmc_write_leveling_loop() for every rank index below dimm_count * 4 and
> + * restores mode32b afterwards. Unless deskew training is disabled, the
> + * deskew settings are validated at the end and retrained up to five times.
> + */
> +static void lmc_write_leveling(struct ddr_priv *priv)
> +{
> +	union cvmx_lmcx_config cfg;
> +	int rankx;
> +	char *s;
> +
> +	/*
> +	 * 4.8.9 LMC Write Leveling
> +	 *
> +	 * LMC supports an automatic write leveling like that described in the
> +	 * JEDEC DDR3 specifications separately per byte-lane.
> +	 *
> +	 * All of DDR PLL, LMC CK, LMC DRESET, and early LMC initializations
> +	 * must be completed prior to starting this LMC write-leveling sequence.
> +	 *
> +	 * There are many possible procedures that will write-level all the
> +	 * attached DDR3 DRAM parts. One possibility is for software to simply
> +	 * write the desired values into LMC(0)_WLEVEL_RANK(0..3). This section
> +	 * describes one possible sequence that uses LMC's automatic
> +	 * write-leveling capabilities.
> +	 *
> +	 * 1. If the DQS/DQ delays on the board may be more than the ADD/CMD
> +	 *    delays, then ensure that LMC(0)_CONFIG[EARLY_DQX] is set at this
> +	 *    point.
> +	 *
> +	 * Do the remaining steps 2-7 separately for each rank i with attached
> +	 * DRAM.
> +	 *
> +	 * 2. Write LMC(0)_WLEVEL_RANKi = 0.
> +	 *
> +	 * 3. For x8 parts:
> +	 *
> +	 *    Without changing any other fields in LMC(0)_WLEVEL_CTL, write
> +	 *    LMC(0)_WLEVEL_CTL[LANEMASK] to select all byte lanes with attached
> +	 *    DRAM.
> +	 *
> +	 *    For x16 parts:
> +	 *
> +	 *    Without changing any other fields in LMC(0)_WLEVEL_CTL, write
> +	 *    LMC(0)_WLEVEL_CTL[LANEMASK] to select all even byte lanes with
> +	 *    attached DRAM.
> +	 *
> +	 * 4. Without changing any other fields in LMC(0)_CONFIG,
> +	 *
> +	 *    o write LMC(0)_SEQ_CTL[SEQ_SEL] to select write-leveling
> +	 *
> +	 *    o write LMC(0)_CONFIG[RANKMASK] = (1 << i)
> +	 *
> +	 *    o write LMC(0)_SEQ_CTL[INIT_START] = 1
> +	 *
> +	 *    LMC will initiate write-leveling at this point. Assuming
> +	 *    LMC(0)_WLEVEL_CTL [SSET] = 0, LMC first enables write-leveling on
> +	 *    the selected DRAM rank via a DDR3 MR1 write, then sequences
> +	 *    through and accumulates write-leveling results for eight
> +	 *    different delay settings twice, starting at a delay of zero in
> +	 *    this case since LMC(0)_WLEVEL_RANKi[BYTE*<4:3>] = 0, increasing
> +	 *    by 1/8 CK each setting, covering a total distance of one CK,
> +	 *    then disables the write-leveling via another DDR3 MR1 write.
> +	 *
> +	 *    After the sequence through 16 delay settings is complete:
> +	 *
> +	 *    o LMC sets LMC(0)_WLEVEL_RANKi[STATUS] = 3
> +	 *
> +	 *    o LMC sets LMC(0)_WLEVEL_RANKi[BYTE*<2:0>] (for all ranks selected
> +	 *      by LMC(0)_WLEVEL_CTL[LANEMASK]) to indicate the first write
> +	 *      leveling result of 1 that followed result of 0 during the
> +	 *      sequence, except that the LMC always writes
> +	 *      LMC(0)_WLEVEL_RANKi[BYTE*<0>]=0.
> +	 *
> +	 *    o Software can read the eight write-leveling results from the
> +	 *      first pass through the delay settings by reading
> +	 *      LMC(0)_WLEVEL_DBG[BITMASK] (after writing
> +	 *      LMC(0)_WLEVEL_DBG[BYTE]). (LMC does not retain the
> +	 *      write-leveling results from the second pass through the eight
> +	 *      delay settings. They should often be identical to the
> +	 *      LMC(0)_WLEVEL_DBG[BITMASK] results, though.)
> +	 *
> +	 * 5. Wait until LMC(0)_WLEVEL_RANKi[STATUS] != 2.
> +	 *
> +	 *    LMC will have updated LMC(0)_WLEVEL_RANKi[BYTE*<2:0>] for all byte
> +	 *    lanes selected by LMC(0)_WLEVEL_CTL[LANEMASK] at this point.
> +	 *    LMC(0)_WLEVEL_RANKi[BYTE*<4:3>] will still be the value that
> +	 *    software wrote in substep 2 above, which is 0.
> +	 *
> +	 * 6. For x16 parts:
> +	 *
> +	 *    Without changing any other fields in LMC(0)_WLEVEL_CTL, write
> +	 *    LMC(0)_WLEVEL_CTL[LANEMASK] to select all odd byte lanes with
> +	 *    attached DRAM.
> +	 *
> +	 *    Repeat substeps 4 and 5 with this new LMC(0)_WLEVEL_CTL[LANEMASK]
> +	 *    setting. Skip to substep 7 if this has already been done.
> +	 *
> +	 *    For x8 parts:
> +	 *
> +	 *    Skip this substep. Go to substep 7.
> +	 *
> +	 * 7. Calculate LMC(0)_WLEVEL_RANKi[BYTE*<4:3>] settings for all byte
> +	 *    lanes on all ranks with attached DRAM.
> +	 *
> +	 *    At this point, all byte lanes on rank i with attached DRAM should
> +	 *    have been write-leveled, and LMC(0)_WLEVEL_RANKi[BYTE*<2:0>] has
> +	 *    the result for each byte lane.
> +	 *
> +	 *    But note that the DDR3 write-leveling sequence will only determine
> +	 *    the delay modulo the CK cycle time, and cannot determine how many
> +	 *    additional CK cycles of delay are present. Software must calculate
> +	 *    the number of CK cycles, or equivalently, the
> +	 *    LMC(0)_WLEVEL_RANKi[BYTE*<4:3>] settings.
> +	 *
> +	 *    This BYTE*<4:3> calculation is system/board specific.
> +	 *
> +	 * Many techniques can be used to calculate write-leveling BYTE*<4:3>
> +	 * values, including:
> +	 *
> +	 *    o Known values for some byte lanes.
> +	 *
> +	 *    o Relative values for some byte lanes relative to others.
> +	 *
> +	 *    For example, suppose lane X is likely to require a larger
> +	 *    write-leveling delay than lane Y. A BYTEX<2:0> value that is much
> +	 *    smaller than the BYTEY<2:0> value may then indicate that the
> +	 *    required lane X delay wrapped into the next CK, so BYTEX<4:3>
> +	 *    should be set to BYTEY<4:3>+1.
> +	 *
> +	 *    When ECC DRAM is not present (i.e. when DRAM is not attached to
> +	 *    the DDR_CBS_0_* and DDR_CB<7:0> chip signals, or the
> +	 *    DDR_DQS_<4>_* and DDR_DQ<35:32> chip signals), write
> +	 *    LMC(0)_WLEVEL_RANK*[BYTE8] = LMC(0)_WLEVEL_RANK*[BYTE0],
> +	 *    using the final calculated BYTE0 value.
> +	 *    Write LMC(0)_WLEVEL_RANK*[BYTE4] = LMC(0)_WLEVEL_RANK*[BYTE0],
> +	 *    using the final calculated BYTE0 value.
> +	 *
> +	 * 8. Initialize LMC(0)_WLEVEL_RANK* values for all unused ranks.
> +	 *
> +	 *    Let rank i be a rank with attached DRAM.
> +	 *
> +	 *    For all ranks j that do not have attached DRAM, set
> +	 *    LMC(0)_WLEVEL_RANKj = LMC(0)_WLEVEL_RANKi.
> +	 */
> +
> +	/* defaults; several of these can be overridden from the environment */
> +	rankx = 0;
> +	wl_roundup = 0;
> +	disable_hwl_validity = 0;
> +
> +	// wl_pbm_pump: weight for write-leveling PBMs...
> +	// 0 causes original behavior
> +	// 1 allows a minority of 2 pbms to outscore a majority of 3 non-pbms
> +	// 4 would allow a minority of 1 pbm to outscore a majority of 4
> +	// non-pbms
> +	wl_pbm_pump = 4;	// FIXME: is 4 too much?
> +
> +	if (wl_loops) {
> +		debug("N%d.LMC%d: Performing Hardware Write-Leveling\n", node,
> +		      if_num);
> +	} else {
> +		/* Force software write-leveling to run */
> +		wl_mask_err = 1;
> +		debug("N%d.LMC%d: Forcing software Write-Leveling\n", node,
> +		      if_num);
> +	}
> +
> +	default_wl_rtt_nom = (ddr_type == DDR3_DRAM) ?
> +		rttnom_20ohm : ddr4_rttnom_40ohm;
> +
> +	/*
> +	 * Remember ECC enable and the current mode32b setting, then force
> +	 * mode32b to !if_64b for the duration of write-leveling.
> +	 */
> +	cfg.u64 = lmc_rd(priv, CVMX_LMCX_CONFIG(if_num));
> +	ecc_ena = cfg.s.ecc_ena;
> +	save_mode32b = cfg.cn78xx.mode32b;
> +	cfg.cn78xx.mode32b = (!if_64b);
> +	lmc_wr(priv, CVMX_LMCX_CONFIG(if_num), cfg.u64);
> +	debug("%-45s : %d\n", "MODE32B", cfg.cn78xx.mode32b);
> +
> +	/* optional environment overrides */
> +	s = lookup_env(priv, "ddr_wlevel_roundup");
> +	if (s)
> +		wl_roundup = simple_strtoul(s, NULL, 0);
> +
> +	s = lookup_env(priv, "ddr_wlevel_printall");
> +	if (s)
> +		wl_print = strtoul(s, NULL, 0);
> +
> +	/* note: the variable is spelled "pump", the env name "bump" */
> +	s = lookup_env(priv, "ddr_wlevel_pbm_bump");
> +	if (s)
> +		wl_pbm_pump = strtoul(s, NULL, 0);
> +
> +	// default to disable when RL sequential delay check is disabled
> +	disable_hwl_validity = disable_sequential_delay_check;
> +	s = lookup_env(priv, "ddr_disable_hwl_validity");
> +	if (s)
> +		disable_hwl_validity = !!strtoul(s, NULL, 0);
> +
> +	s = lookup_env(priv, "ddr_wl_rtt_nom");
> +	if (s)
> +		default_wl_rtt_nom = simple_strtoul(s, NULL, 0);
> +
> +	s = lookup_env(priv, "ddr_match_wl_rtt_nom");
> +	if (s)
> +		match_wl_rtt_nom = !!simple_strtoul(s, NULL, 0);
> +
> +	/* the per-rank rtt_nom values are taken from MODEREG_PARAMS1 */
> +	if (match_wl_rtt_nom)
> +		mp1.u64 = lmc_rd(priv, CVMX_LMCX_MODEREG_PARAMS1(if_num));
> +
> +	// For DDR3, we do not touch WLEVEL_CTL fields OR_DIS or BITMASK
> +	// For DDR4, we touch WLEVEL_CTL fields OR_DIS or BITMASK here
> +	if (ddr_type == DDR4_DRAM) {
> +		int default_or_dis = 1;
> +		int default_bitmask = 0xff;
> +
> +		// when x4, use only the lower nibble
> +		if (dram_width == 4) {
> +			default_bitmask = 0x0f;
> +			if (wl_print) {
> +				debug("N%d.LMC%d: WLEVEL_CTL: default bitmask is 0x%02x for DDR4 x4\n",
> +				      node, if_num, default_bitmask);
> +			}
> +		}
> +
> +		wl_ctl.u64 = lmc_rd(priv, CVMX_LMCX_WLEVEL_CTL(if_num));
> +		wl_ctl.s.or_dis = default_or_dis;
> +		wl_ctl.s.bitmask = default_bitmask;
> +
> +		// allow overrides
> +		s = lookup_env(priv, "ddr_wlevel_ctl_or_dis");
> +		if (s)
> +			wl_ctl.s.or_dis = !!strtoul(s, NULL, 0);
> +
> +		s = lookup_env(priv, "ddr_wlevel_ctl_bitmask");
> +		if (s)
> +			wl_ctl.s.bitmask = simple_strtoul(s, NULL, 0);
> +
> +		// print only if not defaults
> +		if (wl_ctl.s.or_dis != default_or_dis ||
> +		    wl_ctl.s.bitmask != default_bitmask) {
> +			debug("N%d.LMC%d: WLEVEL_CTL: or_dis=%d, bitmask=0x%02x\n",
> +			      node, if_num, wl_ctl.s.or_dis, wl_ctl.s.bitmask);
> +		}
> +
> +		// always write
> +		lmc_wr(priv, CVMX_LMCX_WLEVEL_CTL(if_num), wl_ctl.u64);
> +	}
> +
> +	// Start the hardware write-leveling loop per rank
> +	/* ranks missing from rank_mask are skipped inside the callee */
> +	for (rankx = 0; rankx < dimm_count * 4; rankx++)
> +		lmc_write_leveling_loop(priv, rankx);
> +
> +	/* put back the mode32b setting saved above */
> +	cfg.u64 = lmc_rd(priv, CVMX_LMCX_CONFIG(if_num));
> +	cfg.cn78xx.mode32b = save_mode32b;
> +	lmc_wr(priv, CVMX_LMCX_CONFIG(if_num), cfg.u64);
> +	debug("%-45s : %d\n", "MODE32B", cfg.cn78xx.mode32b);
> +
> +	// At the end of HW Write Leveling, check on some DESKEW things...
> +	if (!disable_deskew_training) {
> +		struct deskew_counts dsk_counts;
> +		int retry_count = 0;
> +
> +		debug("N%d.LMC%d: Check Deskew Settings before Read-Leveling.\n",
> +		      node, if_num);
> +
> +		/* validate, and retrain at most five times on bad results */
> +		do {
> +			validate_deskew_training(priv, rank_mask, if_num,
> +						 &dsk_counts, 1);
> +
> +			// only RAWCARD A or B will not benefit from
> +			// retraining if there's only saturation
> +			// or any rawcard if there is a nibble error
> +			if ((!spd_rawcard_aorb && dsk_counts.saturated > 0) ||
> +			    (dsk_counts.nibrng_errs != 0 ||
> +			     dsk_counts.nibunl_errs != 0)) {
> +				retry_count++;
> +				debug("N%d.LMC%d: Deskew Status indicates saturation or nibble errors - retry %d Training.\n",
> +				      node, if_num, retry_count);
> +				perform_deskew_training(priv, rank_mask, if_num,
> +							spd_rawcard_aorb);
> +			} else {
> +				break;
> +			}
> +		} while (retry_count < 5);
> +	}
> +}
> +
> +/*
> + * lmc_workaround() - apply chip-specific LMC workarounds
> + *
> + * @priv:	driver private data, handed to the LMC register accessors
> + *
> + * Three independent steps, in this order:
> + *  - CN78XX pass 1.x: if TIMING_PARAMS1[trcd] reads back as 0, program
> + *    trcd = 12, additive latency (AL = 2) and CONTROL[pocas], then issue
> + *    an MR1 write to every rank present in rank_mask.
> + *  - Unless deskew training is disabled, validate the deskew settings
> + *    once more (output only).
> + *  - Errata 26304 on CN78XX pass 2.x / CNF75XX pass 1.x: set
> + *    EXT_CONFIG[drive_ena_bprch] when write deskew is enabled and any
> + *    used byte lane has a DSK_ADJ value above 4.
> + */
> +static void lmc_workaround(struct ddr_priv *priv)
> +{
> +	/* Workaround Trcd overflow by using Additive latency. */
> +	if (octeon_is_cpuid(OCTEON_CN78XX_PASS1_X)) {
> +		union cvmx_lmcx_modereg_params0 modereg0;
> +		union cvmx_lmcx_timing_params1 timing1;
> +		union cvmx_lmcx_control control;
> +		int rank;
> +
> +		timing1.u64 = lmc_rd(priv, CVMX_LMCX_TIMING_PARAMS1(if_num));
> +		modereg0.u64 = lmc_rd(priv, CVMX_LMCX_MODEREG_PARAMS0(if_num));
> +		control.u64 = lmc_rd(priv, CVMX_LMCX_CONTROL(if_num));
> +
> +		if (timing1.cn78xx.trcd == 0) {
> +			debug("Workaround Trcd overflow by using Additive latency.\n");
> +
> +			/* Fixed trcd of 12 with additive latency enabled */
> +			timing1.cn78xx.trcd = 12;
> +			modereg0.s.al = 2;	/* CL-2 */
> +			control.s.pocas = 1;
> +
> +			debug("MODEREG_PARAMS0                               : 0x%016llx\n",
> +			      modereg0.u64);
> +			lmc_wr(priv, CVMX_LMCX_MODEREG_PARAMS0(if_num),
> +			       modereg0.u64);
> +
> +			debug("TIMING_PARAMS1                                : 0x%016llx\n",
> +			      timing1.u64);
> +			lmc_wr(priv, CVMX_LMCX_TIMING_PARAMS1(if_num),
> +			       timing1.u64);
> +
> +			debug("LMC_CONTROL                                   : 0x%016llx\n",
> +			      control.u64);
> +			lmc_wr(priv, CVMX_LMCX_CONTROL(if_num), control.u64);
> +
> +			/* MR1 write to each populated rank */
> +			for (rank = 0; rank < dimm_count * 4; rank++) {
> +				if (rank_mask & (1 << rank))
> +					ddr4_mrw(priv, if_num, rank, -1, 1, 0);
> +			}
> +		}
> +	}
> +
> +	/*
> +	 * Output only: gives one more look at the deskew settings before
> +	 * software write-leveling starts.
> +	 */
> +	if (!disable_deskew_training) {
> +		struct deskew_counts counts;
> +
> +		debug("N%d.LMC%d: Check Deskew Settings before software Write-Leveling.\n",
> +		      node, if_num);
> +		validate_deskew_training(priv, rank_mask, if_num, &counts, 3);
> +	}
> +
> +	/*
> +	 * Workaround Errata 26304 (T88 at 2.0, O75 at 1.x, O78 at 2.x)
> +	 *
> +	 * When the CSRs LMCX_DLL_CTL3[WR_DESKEW_ENA] = 1 AND
> +	 * LMCX_PHY_CTL2[DQS[0..8]_DSK_ADJ] > 4, set
> +	 * LMCX_EXT_CONFIG[DRIVE_ENA_BPRCH] = 1.
> +	 */
> +	if (octeon_is_cpuid(OCTEON_CN78XX_PASS2_X) ||
> +	    octeon_is_cpuid(OCTEON_CNF75XX_PASS1_X)) {
> +		union cvmx_lmcx_dll_ctl3 dll3;
> +		union cvmx_lmcx_phy_ctl2 phy2;
> +		union cvmx_lmcx_ext_config ext;
> +		int dsk_adj_above_4 = 0;
> +		int lane;
> +
> +		phy2.u64 = lmc_rd(priv, CVMX_LMCX_PHY_CTL2(if_num));
> +		ext.u64 = lmc_rd(priv, CVMX_LMCX_EXT_CONFIG(if_num));
> +		dll3.u64 = lmc_rd(priv, CVMX_LMCX_DLL_CTL3(if_num));
> +
> +		/* one 3-bit field per lane; only lanes 0-7 are inspected */
> +		for (lane = 0; lane < 8; ++lane) {
> +			if ((if_bytemask & (1 << lane)) &&
> +			    ((phy2.u64 >> (lane * 3)) & 0x7) > 4)
> +				dsk_adj_above_4 = 1;
> +		}
> +
> +		if (dll3.s.wr_deskew_ena == 1 && dsk_adj_above_4) {
> +			ext.s.drive_ena_bprch = 1;
> +			lmc_wr(priv, CVMX_LMCX_EXT_CONFIG(if_num), ext.u64);
> +			debug("LMC%d: Forcing DRIVE_ENA_BPRCH for Workaround Errata 26304.\n",
> +			      if_num);
> +		}
> +	}
> +}
> +
> +/* Software Write-Leveling block */
> +
> +/*
> + * The Vref sweep uses one linear index covering both ranges: index
> + * 0x00 - 0x17 selects range 2, index 0x18 - 0x4a selects range 1 (after
> + * subtracting VREF_RANGE2_LIMIT). VREF_FINAL (0x4a) is the last index.
> + */
> +#define VREF_RANGE1_LIMIT 0x33	/* range1 is valid for 0x00 - 0x32 */
> +#define VREF_RANGE2_LIMIT 0x18	/* range2 is valid for 0x00 - 0x17 */
> +/* full window is valid for 0x00 to 0x4A */
> +/* let 0x00 - 0x17 be range 2, 0x18 - 0x4a be range 1 */
> +#define VREF_LIMIT        (VREF_RANGE1_LIMIT + VREF_RANGE2_LIMIT)
> +#define VREF_FINAL        (VREF_LIMIT - 1)
> +
> +/* How the write-leveling delay of a byte lane (or rank) was obtained */
> +enum sw_wl_status {
> +	WL_ESTIMATED = 0, /* HW/SW write-leveling failed, result estimated */
> +	WL_HARDWARE = 1,	/* H/W write-leveling succeeded */
> +	WL_SOFTWARE = 2, /* S/W write-leveling passed 2 contiguous settings */
> +	WL_SOFTWARE1 = 3, /* S/W write-leveling passed 1 marginal setting */
> +};
> +
> +/*
> + * State shared by the software write-leveling and DDR4 Vref training
> + * functions. Every variable is placed explicitly in the .data section.
> + */
> +/* address handed to the DRAM test routines for the rank under test */
> +static u64 rank_addr __section(".data");
> +/* current index of the Vref sweep, compared against VREF_FINAL */
> +static int vref_val __section(".data");
> +/* value and range finally programmed through set_vref() */
> +static int final_vref_val __section(".data");
> +static int final_vref_range __section(".data");
> +static int start_vref_val __section(".data");
> +/* computed Vref; used as final value when measured_vref_flag is clear */
> +static int computed_final_vref_val __section(".data");
> +/* length and start index of the best passing Vref window */
> +static char best_vref_val_count __section(".data");
> +static char vref_val_count __section(".data");
> +static char best_vref_val_start __section(".data");
> +static char vref_val_start __section(".data");
> +static int bytes_failed __section(".data");
> +/* per-byte-lane result status, index 8 being the ECC lane */
> +static enum sw_wl_status byte_test_status[9] __section(".data");
> +static enum sw_wl_status sw_wl_rank_status __section(".data");
> +static int sw_wl_failed __section(".data");
> +/* non-zero: test with run_best_hw_patterns() instead of test_dram_byte64() */
> +static int sw_wl_hw __section(".data");
> +/* non-zero: use the measured Vref window, otherwise the computed value */
> +static int measured_vref_flag __section(".data");
> +
> +/*
> + * ddr4_vref_loop() - program the DDR4 Vref for the current sweep index
> + *
> + * @priv:	driver private data, handed to set_vref(), lmc_rd() and
> + *		lookup_env()
> + * @rankx:	rank whose Vref is programmed
> + *
> + * While vref_val is below VREF_FINAL the function just programs the
> + * value under test. At VREF_FINAL it selects the final setting: the
> + * computed value when measured_vref_flag is clear, otherwise the middle
> + * of the best measured window, or the MODEREG_PARAMS2 default when no
> + * value passed. The environment variable ddr<if>_vref_val_<d><r> can
> + * override the final value before it is programmed.
> + */
> +static void ddr4_vref_loop(struct ddr_priv *priv, int rankx)
> +{
> +	char *override;
> +
> +	/* Sweep step: indices below VREF_RANGE2_LIMIT belong to range 2 */
> +	if (vref_val < VREF_FINAL) {
> +		int low = (vref_val < VREF_RANGE2_LIMIT);
> +
> +		set_vref(priv, if_num, rankx, low ? 1 : 0,
> +			 low ? vref_val : vref_val - VREF_RANGE2_LIMIT);
> +		return;
> +	}
> +
> +	/* Final step: always report the computed value first if valid */
> +	if (computed_final_vref_val >= 0) {
> +		debug("N%d.LMC%d.R%d: vref Computed Summary                 :              %2d (0x%02x)\n",
> +		      node, if_num, rankx,
> +		      computed_final_vref_val, computed_final_vref_val);
> +	}
> +
> +	if (!measured_vref_flag) {
> +		/* use the computed value */
> +		best_vref_val_count = 1;
> +		final_vref_val = computed_final_vref_val;
> +	} else if (best_vref_val_count > 0) {
> +		/* use the centre of the best measured window */
> +		int lo_val, lo_range;
> +		int hi_val, hi_range;
> +
> +		if (best_vref_val_count < 2)
> +			best_vref_val_count = 2;
> +
> +		final_vref_val = best_vref_val_start +
> +			divide_nint(best_vref_val_count - 1, 2);
> +		final_vref_range = (final_vref_val < VREF_RANGE2_LIMIT) ? 1 : 0;
> +		if (!final_vref_range)
> +			final_vref_val -= VREF_RANGE2_LIMIT;
> +
> +		/* window edges, converted to value/range for the summary */
> +		lo_val = best_vref_val_start;
> +		lo_range = 2;
> +		if (lo_val >= VREF_RANGE2_LIMIT) {
> +			lo_range = 1;
> +			lo_val -= VREF_RANGE2_LIMIT;
> +		}
> +
> +		hi_val = best_vref_val_start + best_vref_val_count - 1;
> +		hi_range = 2;
> +		if (hi_val >= VREF_RANGE2_LIMIT) {
> +			hi_range = 1;
> +			hi_val -= VREF_RANGE2_LIMIT;
> +		}
> +
> +		debug("N%d.LMC%d.R%d: vref Training Summary                 :  0x%02x/%1d <----- 0x%02x/%1d -----> 0x%02x/%1d, range: %2d\n",
> +		      node, if_num, rankx, lo_val, lo_range,
> +		      final_vref_val,
> +		      final_vref_range + 1, hi_val, hi_range,
> +		      best_vref_val_count - 1);
> +	} else {
> +		/*
> +		 * Nothing passed: fall back to the default Vref value and
> +		 * range stored for this rank in MODEREG_PARAMS2.
> +		 */
> +		union cvmx_lmcx_modereg_params2 params2;
> +
> +		params2.u64 = lmc_rd(priv, CVMX_LMCX_MODEREG_PARAMS2(if_num));
> +		final_vref_val = (params2.u64 >> (rankx * 10 + 3)) & 0x3f;
> +		final_vref_range = (params2.u64 >> (rankx * 10 + 9)) & 0x01;
> +
> +		debug("N%d.LMC%d.R%d: vref Using Default                    :    %2d <----- %2d (0x%02x) -----> %2d, range%1d\n",
> +		      node, if_num, rankx, final_vref_val,
> +		      final_vref_val, final_vref_val,
> +		      final_vref_val, final_vref_range + 1);
> +	}
> +
> +	/* allow override */
> +	override = lookup_env(priv, "ddr%d_vref_val_%1d%1d",
> +			      if_num, !!(rankx & 2), !!(rankx & 1));
> +	if (override)
> +		final_vref_val = strtoul(override, NULL, 0);
> +
> +	set_vref(priv, if_num, rankx, final_vref_range, final_vref_val);
> +}
> +
> +#define WL_MIN_NO_ERRORS_COUNT 3	// FIXME? three passes without errors
> +
> +/*
> + * Working state of the software write-leveling pass, updated by
> + * lmc_sw_write_leveling_loop(). All of it lives in the .data section.
> + */
> +/* bitmask of byte lanes that failed the most recent test run */
> +static int errors __section(".data");
> +/* delay currently being tried for each byte lane */
> +static int byte_delay[9] __section(".data");
> +/* test mask for test_dram_byte64(); 8 bits per lane for lanes 0-7 */
> +static u64 bytemask __section(".data");
> +/* bitmask of byte lanes that are still being tested */
> +static int bytes_todo __section(".data");
> +/* number of consecutive test runs that returned no errors */
> +static int no_errors_count __section(".data");
> +/* failing bits from the test: [0] holds lanes 0-7, [1] the ECC lane */
> +static u64 bad_bits[2] __section(".data");
> +#if ENABLE_SW_WLEVEL_UTILIZATION
> +/* accumulated and per-run DCLK_CNT / OPS_CNT readings around the test */
> +static u64 sum_dram_dclk __section(".data");
> +static u64 sum_dram_ops __section(".data");
> +static u64 start_dram_dclk __section(".data");
> +static u64 stop_dram_dclk __section(".data");
> +static u64 start_dram_ops __section(".data");
> +static u64 stop_dram_ops __section(".data");
> +#endif
> +
> +/*
> + * lmc_sw_write_leveling_loop() - one pass of software write-leveling
> + *
> + * @priv:	driver private data, handed to the register accessors and
> + *		the DRAM test routines
> + * @rankx:	rank being leveled
> + *
> + * Writes the current delays in wl_rank to LMCX_WLEVEL_RANKX, runs one
> + * DRAM test and then updates each byte lane still listed in bytes_todo:
> + * a failing lane has its delay advanced by 8 (as long as it stays below
> + * 32), otherwise it is switched to a recorded hardware alternate (only
> + * with SWL_TRY_HWL_ALT) or dropped from further testing and marked
> + * WL_ESTIMATED. A passing lane is marked WL_HARDWARE. After a second
> + * consecutive error-free test the per-lane update is skipped.
> + */
> +static void lmc_sw_write_leveling_loop(struct ddr_priv *priv, int rankx)
> +{
> +	int delay;
> +	int b;
> +
> +	// write the current set of WL delays
> +	lmc_wr(priv, CVMX_LMCX_WLEVEL_RANKX(rankx, if_num), wl_rank.u64);
> +	wl_rank.u64 = lmc_rd(priv, CVMX_LMCX_WLEVEL_RANKX(rankx, if_num));
> +
> +	// do the test
> +	if (sw_wl_hw) {
> +		errors = run_best_hw_patterns(priv, if_num, rank_addr,
> +					      DBTRAIN_TEST, bad_bits);
> +		errors &= bytes_todo;	// keep only the ones we are still doing
> +	} else {
> +#if ENABLE_SW_WLEVEL_UTILIZATION
> +		start_dram_dclk = lmc_rd(priv, CVMX_LMCX_DCLK_CNT(if_num));
> +		start_dram_ops = lmc_rd(priv, CVMX_LMCX_OPS_CNT(if_num));
> +#endif
> +		errors = test_dram_byte64(priv, if_num, rank_addr, bytemask,
> +					  bad_bits);
> +
> +#if ENABLE_SW_WLEVEL_UTILIZATION
> +		stop_dram_dclk = lmc_rd(priv, CVMX_LMCX_DCLK_CNT(if_num));
> +		stop_dram_ops = lmc_rd(priv, CVMX_LMCX_OPS_CNT(if_num));
> +		sum_dram_dclk += stop_dram_dclk - start_dram_dclk;
> +		sum_dram_ops += stop_dram_ops - start_dram_ops;
> +#endif
> +	}
> +
> +	debug("WL pass1: test_dram_byte returned 0x%x\n", errors);
> +
> +	// remember, errors will not be returned for byte-lanes that have
> +	// maxxed out...
> +	if (errors == 0) {
> +		no_errors_count++;	// bump
> +		// bypass check/update completely
> +		if (no_errors_count > 1)
> +			return;	// to end of do-while
> +	} else {
> +		no_errors_count = 0;	// reset
> +	}
> +
> +	// check errors by byte
> +	for (b = 0; b < 9; ++b) {
> +		/* lanes already finished are left alone */
> +		if (!(bytes_todo & (1 << b)))
> +			continue;
> +
> +		delay = byte_delay[b];
> +		// yes, an error in this byte lane
> +		if (errors & (1 << b)) {
> +			debug("        byte %d delay %2d Errors\n", b, delay);
> +			// since this byte had an error, we move to the next
> +			// delay value, unless done with it
> +			delay += 8;	// incr by 8 to do delay high-order bits
> +			if (delay < 32) {
> +				upd_wl_rank(&wl_rank, b, delay);
> +				debug("        byte %d delay %2d New\n",
> +				      b, delay);
> +				byte_delay[b] = delay;
> +			} else {
> +				// reached max delay, maybe really done with
> +				// this byte
> +#if SWL_TRY_HWL_ALT
> +				// consider an alt only for computed VREF and
> +				if (!measured_vref_flag &&
> +				    (hwl_alts[rankx].hwl_alt_mask & (1 << b))) {
> +					// if an alt exists...
> +					// just orig low-3 bits
> +					int bad_delay = delay & 0x6;
> +
> +					// yes, use it
> +					delay =	hwl_alts[rankx].hwl_alt_delay[b];
> +					// clear that flag
> +					hwl_alts[rankx].hwl_alt_mask &=
> +						~(1 << b);
> +					upd_wl_rank(&wl_rank, b, delay);
> +					byte_delay[b] = delay;
> +					debug("        byte %d delay %2d ALTERNATE\n",
> +					      b, delay);
> +					debug("N%d.LMC%d.R%d: SWL: Byte %d: %d FAIL, trying ALTERNATE %d\n",
> +					      node, if_num,
> +					      rankx, b, bad_delay, delay);
> +
> +				} else
> +#endif /* SWL_TRY_HWL_ALT */
> +				{
> +					/* failing bits of this lane, for the debug output */
> +					unsigned int bits_bad;
> +
> +					if (b < 8) {
> +						// test no longer, remove from
> +						// byte mask
> +						bytemask &=
> +							~(0xffULL << (8 * b));
> +						bits_bad = (unsigned int)
> +							((bad_bits[0] >>
> +							  (8 * b)) & 0xffUL);
> +					} else {
> +						bits_bad = (unsigned int)
> +						    (bad_bits[1] & 0xffUL);
> +					}
> +
> +					// remove from bytes to do
> +					bytes_todo &= ~(1 << b);
> +					// make sure this is set for this case
> +					byte_test_status[b] = WL_ESTIMATED;
> +					debug("        byte %d delay %2d Exhausted\n",
> +					      b, delay);
> +					if (!measured_vref_flag) {
> +						// this is too noisy when doing
> +						// measured VREF
> +						debug("N%d.LMC%d.R%d: SWL: Byte %d (0x%02x): delay %d EXHAUSTED\n",
> +						      node, if_num, rankx,
> +						      b, bits_bad, delay);
> +					}
> +				}
> +			}
> +		} else {
> +			// no error, stay with current delay, but keep testing
> +			// it...
> +			debug("        byte %d delay %2d Passed\n", b, delay);
> +			byte_test_status[b] = WL_HARDWARE;	// change status
> +		}
> +	}			/* for (b = 0; b < 9; ++b) */
> +}
> +
> +/*
> + * sw_write_lvl_use_ecc() - derive the ECC lane (byte 8) write-level delay
> + *
> + * @priv:	driver private data (not used by this function)
> + * @rankx:	rank index, used for the debug output only
> + *
> + * If the hardware result for byte 8 equals byte 3 or byte 4 it is kept.
> + * Otherwise byte 8 is moved in steps of 8 to the candidate closest to the
> + * rounded average of bytes 3 and 4, forced to an even value. The status
> + * of byte 8 ends up as WL_SOFTWARE when the delay was changed and as
> + * WL_HARDWARE otherwise.
> + */
> +static void sw_write_lvl_use_ecc(struct ddr_priv *priv, int rankx)
> +{
> +	int orig_byte8 = wl_rank.s.byte8;
> +	int best_byte8 = orig_byte8;
> +	int best_dist = 0x1f;
> +	int candidate;
> +	int target;
> +
> +	byte_test_status[8] = WL_HARDWARE;	/* H/W delay value */
> +
> +	/* already in line with one of its neighbours: nothing to adjust */
> +	if (orig_byte8 == wl_rank.s.byte3 || orig_byte8 == wl_rank.s.byte4)
> +		return;
> +
> +	target = divide_nint(wl_rank.s.byte3 + wl_rank.s.byte4, 2);
> +
> +	/*
> +	 * Pick the candidate nearest to the target; on a tie the smaller
> +	 * one wins.
> +	 * NOTE(review): the last candidate is orig_byte8 + 32, which looks
> +	 * larger than the other delay code allows (it stops below 32) -
> +	 * confirm whether the upper bound is intended.
> +	 */
> +	for (candidate = orig_byte8; candidate <= orig_byte8 + 32;
> +	     candidate += 8) {
> +		int dist = abs(candidate - target);
> +
> +		if (dist < best_dist) {
> +			best_dist = dist;
> +			best_byte8 = candidate;
> +		}
> +	}
> +
> +#if SW_WL_CHECK_PATCH
> +	/* only do the check if we are not using measured VREF */
> +	if (!measured_vref_flag) {
> +		int unexpected = 0;
> +
> +		/* Use only even settings, rounding down... */
> +		best_byte8 &= ~1;
> +
> +		/*
> +		 * Validity check on the calculated ECC delay; the expected
> +		 * relation to bytes 3 and 4 depends on the DIMM type.
> +		 */
> +		if (spd_rdimm) {
> +			/*
> +			 * RDIMM, but not mini-RDIMM: may be > byte4, but
> +			 * should never be > byte3
> +			 */
> +			if (spd_dimm_type != 5 &&
> +			    best_byte8 > wl_rank.s.byte3)
> +				unexpected = 1;
> +		} else if (best_byte8 < wl_rank.s.byte3 ||
> +			   best_byte8 > wl_rank.s.byte4) {
> +			/* UDIMM: should never be outside the byte 3-4 range */
> +			unexpected = 1;
> +		}
> +
> +		/*
> +		 * Report whenever the calculation appears bad.
> +		 * This happens if some of the original values were off,
> +		 * or unexpected geometry from DIMM type, or custom
> +		 * circuitry (NIC225E, I am looking at you!).
> +		 * We will trust the calculated value, and depend on
> +		 * later testing to catch any instances when that
> +		 * value is truly bad. The status therefore stays
> +		 * WL_HARDWARE here.
> +		 */
> +		if (unexpected) {
> +			debug("N%d.LMC%d.R%d: SWL: (%cDIMM): calculated ECC delay unexpected (%d/%d/%d)\n",
> +			      node, if_num, rankx,
> +			      (spd_rdimm ? 'R' : 'U'), wl_rank.s.byte4,
> +			      best_byte8, wl_rank.s.byte3);
> +		}
> +	}
> +#endif /* SW_WL_CHECK_PATCH */
> +
> +	/* Use only even settings */
> +	wl_rank.s.byte8 = best_byte8 & ~1;
> +
> +	/* Change the status if s/w adjusted the delay */
> +	if (wl_rank.s.byte8 != orig_byte8)
> +		byte_test_status[8] = WL_SOFTWARE;	/* Estimated delay */
> +}
> +
> +/*
> + * Software write-leveling, parallel search of the delay for all byte lanes
> + * (0..7) that are still listed in if_bytemask.
> + *
> + * Two passes are made, first with wl_offset = 1, then with wl_offset = 0.
> + * In each pass every lane still to do restarts at delay 0 and is stepped
> + * by 2 (up to, but excluding, 32).  After each step the delays are written
> + * to LMC(if_num)_WLEVEL_RANK(rankx) and a memory test is run.  A lane is
> + * "fully passed" once it has seen (1 + wl_offset) consecutive error-free
> + * tests; it then keeps its current delay, gets status WL_SOFTWARE
> + * (wl_offset 1) or WL_SOFTWARE1 (wl_offset 0) and is dropped from all
> + * further testing.  Lanes that never fully pass fall back to an estimate
> + * derived from the read-leveling result via rlevel_to_wlevel().
> + *
> + * @priv:	driver private data, used for the CSR accesses and tests
> + * @rankx:	rank whose WLEVEL_RANK register is being tuned
> + */
> +static __maybe_unused void parallel_wl_block_delay(struct ddr_priv *priv,
> +						   int rankx)
> +{
> +	int errors;
> +	int byte_delay[8];
> +	int byte_passed[8];
> +	u64 bytemask;
> +	u64 bitmask;
> +	int wl_offset;
> +	int bytes_todo;
> +	int sw_wl_offset = 1;
> +	int delay;
> +	int b;
> +
> +	for (b = 0; b < 8; ++b)
> +		byte_passed[b] = 0;
> +
> +	/* bit b set: byte lane b still needs a working delay */
> +	bytes_todo = if_bytemask;
> +
> +	for (wl_offset = sw_wl_offset; wl_offset >= 0; --wl_offset) {
> +		debug("Starting wl_offset for-loop: %d\n", wl_offset);
> +
> +		/* data-bit mask of the lanes under test in this pass */
> +		bytemask = 0;
> +
> +		for (b = 0; b < 8; ++b) {
> +			byte_delay[b] = 0;
> +			// this does not contain fully passed bytes
> +			if (!(bytes_todo & (1 << b)))
> +				continue;
> +
> +			// reset across passes if not fully passed
> +			byte_passed[b] = 0;
> +			upd_wl_rank(&wl_rank, b, 0);	// all delays start at 0
> +			/* in 32-bit mode only the low nibble of byte 4 counts */
> +			bitmask = ((!if_64b) && (b == 4)) ? 0x0f : 0xff;
> +			// set the bytes bits in the bytemask
> +			bytemask |= bitmask << (8 * b);
> +		}		/* for (b = 0; b < 8; ++b) */
> +
> +		// start a pass if there is any byte lane to test
> +		while (bytemask != 0) {
> +			debug("Starting bytemask while-loop: 0x%llx\n",
> +			      bytemask);
> +
> +			// write this set of WL delays
> +			lmc_wr(priv, CVMX_LMCX_WLEVEL_RANKX(rankx, if_num),
> +			       wl_rank.u64);
> +			wl_rank.u64 = lmc_rd(priv,
> +					     CVMX_LMCX_WLEVEL_RANKX(rankx,
> +								    if_num));
> +
> +			// do the test
> +			if (sw_wl_hw) {
> +				errors = run_best_hw_patterns(priv, if_num,
> +							      rank_addr,
> +							      DBTRAIN_TEST,
> +							      NULL) & 0xff;
> +			} else {
> +				errors = test_dram_byte64(priv, if_num,
> +							  rank_addr, bytemask,
> +							  NULL);
> +			}
> +
> +			debug("test_dram_byte returned 0x%x\n", errors);
> +
> +			// check errors by byte
> +			for (b = 0; b < 8; ++b) {
> +				if (!(bytes_todo & (1 << b)))
> +					continue;
> +
> +				delay = byte_delay[b];
> +				if (errors & (1 << b)) {	// yes, an error
> +					debug("        byte %d delay %2d Errors\n",
> +					      b, delay);
> +					/* the run of good settings is broken */
> +					byte_passed[b] = 0;
> +				} else {	// no error
> +					byte_passed[b] += 1;
> +					// Look for consecutive working settings
> +					if (byte_passed[b] == (1 + wl_offset)) {
> +						debug("        byte %d delay %2d FULLY Passed\n",
> +						      b, delay);
> +						if (wl_offset == 1) {
> +							byte_test_status[b] =
> +								WL_SOFTWARE;
> +						} else if (wl_offset == 0) {
> +							byte_test_status[b] =
> +								WL_SOFTWARE1;
> +						}
> +
> +						// test no longer, remove
> +						// from byte mask this pass
> +						bytemask &= ~(0xffULL <<
> +							      (8 * b));
> +						// remove completely from
> +						// concern
> +						bytes_todo &= ~(1 << b);
> +						// on to the next byte, bypass
> +						// delay updating!!
> +						continue;
> +					} else {
> +						debug("        byte %d delay %2d Passed\n",
> +						      b, delay);
> +					}
> +				}
> +
> +				// error or no, here we move to the next delay
> +				// value for this byte, unless done all delays
> +				// only a byte that has "fully passed" will
> +				// bypass around this,
> +				delay += 2;
> +				if (delay < 32) {
> +					upd_wl_rank(&wl_rank, b, delay);
> +					debug("        byte %d delay %2d New\n",
> +					      b, delay);
> +					byte_delay[b] = delay;
> +				} else {
> +					// reached max delay, done with this
> +					// byte
> +					debug("        byte %d delay %2d Exhausted\n",
> +					      b, delay);
> +					// test no longer, remove from byte
> +					// mask this pass
> +					/*
> +					 * bytes_todo keeps the lane, so it is
> +					 * retried in the next wl_offset pass
> +					 */
> +					bytemask &= ~(0xffULL << (8 * b));
> +				}
> +			}	/* for (b = 0; b < 8; ++b) */
> +			debug("End of for-loop: bytemask 0x%llx\n", bytemask);
> +		}		/* while (bytemask != 0) */
> +	}
> +
> +	for (b = 0; b < 8; ++b) {
> +		// any bytes left in bytes_todo did not pass
> +		if (bytes_todo & (1 << b)) {
> +			union cvmx_lmcx_rlevel_rankx lmc_rlevel_rank;
> +
> +			/*
> +			 * Last resort. Use Rlevel settings to estimate
> +			 * Wlevel if software write-leveling fails
> +			 */
> +			debug("Using RLEVEL as WLEVEL estimate for byte %d\n",
> +			      b);
> +			lmc_rlevel_rank.u64 =
> +				lmc_rd(priv, CVMX_LMCX_RLEVEL_RANKX(rankx,
> +								    if_num));
> +			rlevel_to_wlevel(&lmc_rlevel_rank, &wl_rank, b);
> +		}
> +	}			/* for (b = 0; b < 8; ++b) */
> +}
> +
> +/*
> + * Software write-leveling for all ranks of the current interface (if_num).
> + *
> + * For every rank set in rank_mask the H/W write-leveling result is read
> + * from LMC(if_num)_WLEVEL_RANK(rankx) and refined experimentally with short
> + * memory tests, either with the H/W-assisted patterns or with the pure S/W
> + * algorithm.  The S/W algorithm is forced in 32-bit mode and on CN78XX
> + * pass 1.x (and, with SWL_WITH_HW_ALTS_CHOOSE_SW, when H/W write-leveling
> + * found alternates); ECC is switched off while it runs and restored at
> + * the end.  For DDR4 the pass-1 search is repeated over a range of VREF
> + * values and the longest run of passing values is recorded in
> + * best_vref_val_start / best_vref_val_count.  If pass 1 leaves failed
> + * bytes, pass 2 (parallel_wl_block_delay()) is run unless
> + * DISABLE_SW_WL_PASS_2 is set.
> + *
> + * Afterwards optional environment overrides ("ddr%d_wlevel_rank%d_byte%d",
> + * "ddr%d_wlevel_rank%d") are applied, unused WLEVEL_RANK entries are filled
> + * (WLEXTRAS_PATCH) and MODE32B is programmed.
> + *
> + * Environment knobs read here: ddr_sw_wlevel_hw, ddr_software_wlevel,
> + * ddr%d_dram_connection, ddr_measured_vref.
> + *
> + * @priv:	driver private data
> + *
> + * Return: 0 on success, -EAGAIN if the short memory test indicates that
> + * LMC init has to be retried and chip reset is disabled.  With chip reset
> + * enabled the node is reset instead.
> + */
> +static int lmc_sw_write_leveling(struct ddr_priv *priv)
> +{
> +	/* Try to determine/optimize write-level delays experimentally. */
> +	union cvmx_lmcx_wlevel_rankx wl_rank_hw_res;
> +	union cvmx_lmcx_config cfg;
> +	int rankx;
> +	int byte;
> +	char *s;
> +	int i;
> +
> +	int active_rank;
> +	int sw_wl_enable = 1;	/* FIX... Should be customizable. */
> +	int interfaces;
> +
> +	/* indexed by byte_test_status[] values in the final dump below */
> +	static const char * const wl_status_strings[] = {
> +		"(e)",
> +		"   ",
> +		"   ",
> +		"(1)"
> +	};
> +
> +	// FIXME: make HW-assist the default now?
> +	int sw_wl_hw_default = SW_WLEVEL_HW_DEFAULT;
> +	int dram_connection = c_cfg->dram_connection;
> +
> +	s = lookup_env(priv, "ddr_sw_wlevel_hw");
> +	if (s)
> +		sw_wl_hw_default = !!strtoul(s, NULL, 0);
> +	if (!if_64b)		// must use SW algo if 32-bit mode
> +		sw_wl_hw_default = 0;
> +
> +	// can never use hw-assist
> +	if (octeon_is_cpuid(OCTEON_CN78XX_PASS1_X))
> +		sw_wl_hw_default = 0;
> +
> +	s = lookup_env(priv, "ddr_software_wlevel");
> +	if (s)
> +		sw_wl_enable = strtoul(s, NULL, 0);
> +
> +	s = lookup_env(priv, "ddr%d_dram_connection", if_num);
> +	if (s)
> +		dram_connection = !!strtoul(s, NULL, 0);
> +
> +	cvmx_rng_enable();
> +
> +#if SWL_WITH_HW_ALTS_CHOOSE_SW
> +	// Choose the SW algo for SWL if any HWL alternates were found
> +	// NOTE: we have to do this here, and for all, since HW-assist
> +	// including ECC requires ECC enable
> +	for (rankx = 0; rankx < dimm_count * 4; rankx++) {
> +		if (!sw_wl_enable)
> +			break;
> +		if (!(rank_mask & (1 << rankx)))
> +			continue;
> +
> +		// if we are doing HW-assist, and there are alternates,
> +		// switch to SW-algorithm for all
> +		if (sw_wl_hw_default && hwl_alts[rankx].hwl_alt_mask) {
> +			debug("N%d.LMC%d.R%d: Using SW algorithm for write-leveling this rank\n",
> +			      node, if_num, rankx);
> +			sw_wl_hw_default = 0;
> +			break;
> +		}
> +	}
> +#endif
> +
> +	/*
> +	 * Get the measured_vref setting from the config, check for an
> +	 * override...
> +	 */
> +	/* NOTE: measured_vref=1 (ON) means force use of MEASURED vref... */
> +	// NOTE: measured VREF can only be done for DDR4
> +	if (ddr_type == DDR4_DRAM) {
> +		measured_vref_flag = c_cfg->measured_vref;
> +		s = lookup_env(priv, "ddr_measured_vref");
> +		if (s)
> +			measured_vref_flag = !!strtoul(s, NULL, 0);
> +	} else {
> +		measured_vref_flag = 0;	// OFF for DDR3
> +	}
> +
> +	/*
> +	 * Ensure disabled ECC for DRAM tests using the SW algo, else leave
> +	 * it untouched
> +	 */
> +	if (!sw_wl_hw_default) {
> +		cfg.u64 = lmc_rd(priv, CVMX_LMCX_CONFIG(if_num));
> +		cfg.cn78xx.ecc_ena = 0;
> +		lmc_wr(priv, CVMX_LMCX_CONFIG(if_num), cfg.u64);
> +	}
> +
> +	/*
> +	 * We need to track absolute rank number, as well as how many
> +	 * active ranks we have.  Two single rank DIMMs show up as
> +	 * ranks 0 and 2, but only 2 ranks are active.
> +	 */
> +	active_rank = 0;
> +
> +	/* number of bits set in if_mask */
> +	interfaces = __builtin_popcount(if_mask);
> +
> +	for (rankx = 0; rankx < dimm_count * 4; rankx++) {
> +		/* per-rank state, reset before each rank is processed */
> +		final_vref_range = 0;
> +		start_vref_val = 0;
> +		computed_final_vref_val = -1;
> +		sw_wl_rank_status = WL_HARDWARE;
> +		sw_wl_failed = 0;
> +		sw_wl_hw = sw_wl_hw_default;
> +
> +		if (!sw_wl_enable)
> +			break;
> +
> +		if (!(rank_mask & (1 << rankx)))
> +			continue;
> +
> +		debug("N%d.LMC%d.R%d: Performing Software Write-Leveling %s\n",
> +		      node, if_num, rankx,
> +		      (sw_wl_hw) ? "with H/W assist" :
> +		      "with S/W algorithm");
> +
> +		if (ddr_type == DDR4_DRAM && num_ranks != 4) {
> +			// always compute when we can...
> +			computed_final_vref_val =
> +			    compute_vref_val(priv, if_num, rankx, dimm_count,
> +					     num_ranks, imp_val,
> +					     is_stacked_die, dram_connection);
> +
> +			// but only use it if allowed
> +			if (!measured_vref_flag) {
> +				// skip all the measured vref processing,
> +				// just the final setting
> +				start_vref_val = VREF_FINAL;
> +			}
> +		}
> +
> +		/* Save off the h/w wl results */
> +		wl_rank_hw_res.u64 = lmc_rd(priv,
> +					    CVMX_LMCX_WLEVEL_RANKX(rankx,
> +								   if_num));
> +
> +		/* current and best run of consecutive passing VREF values */
> +		vref_val_count = 0;
> +		vref_val_start = 0;
> +		best_vref_val_count = 0;
> +		best_vref_val_start = 0;
> +
> +		/* Loop one extra time using the Final vref value. */
> +		for (vref_val = start_vref_val; vref_val < VREF_LIMIT;
> +		     ++vref_val) {
> +			if (ddr_type == DDR4_DRAM)
> +				ddr4_vref_loop(priv, rankx);
> +
> +			/* Restore the saved value */
> +			wl_rank.u64 = wl_rank_hw_res.u64;
> +
> +			for (byte = 0; byte < 9; ++byte)
> +				byte_test_status[byte] = WL_ESTIMATED;
> +
> +			if (wl_mask_err == 0) {
> +				/*
> +				 * Determine address of DRAM to test for
> +				 * pass 1 of software write leveling.
> +				 */
> +				rank_addr = active_rank *
> +					(1ull << (pbank_lsb - bunk_enable +
> +						  (interfaces / 2)));
> +
> +				/*
> +				 * Adjust address for boot bus hole in memory
> +				 * map.
> +				 */
> +				if (rank_addr > 0x10000000)
> +					rank_addr += 0x10000000;
> +
> +				debug("N%d.LMC%d.R%d: Active Rank %d Address: 0x%llx\n",
> +				      node, if_num, rankx, active_rank,
> +				      rank_addr);
> +
> +				// start parallel write-leveling block for
> +				// delay high-order bits
> +				errors = 0;
> +				no_errors_count = 0;
> +#if ENABLE_SW_WLEVEL_UTILIZATION
> +				sum_dram_dclk = 0;
> +				sum_dram_ops = 0;
> +#endif
> +
> +				if (if_64b) {
> +					bytes_todo = (sw_wl_hw) ?
> +						if_bytemask : 0xFF;
> +					bytemask = ~0ULL;
> +				} else {
> +					// 32-bit, must be using SW algo,
> +					// only data bytes
> +					bytes_todo = 0x0f;
> +					bytemask = 0x00000000ffffffffULL;
> +				}
> +
> +				/* start each tested lane at its H/W delay */
> +				for (byte = 0; byte < 9; ++byte) {
> +					if (!(bytes_todo & (1 << byte))) {
> +						byte_delay[byte] = 0;
> +					} else {
> +						byte_delay[byte] =
> +						    get_wl_rank(&wl_rank, byte);
> +					}
> +				}	/* for (byte = 0; byte < 9; ++byte) */
> +
> +				/*
> +				 * NOTE(review): no_errors_count is not changed
> +				 * here, so it is presumably advanced inside
> +				 * lmc_sw_write_leveling_loop() - confirm.
> +				 */
> +				do {
> +					lmc_sw_write_leveling_loop(priv, rankx);
> +				} while (no_errors_count <
> +					 WL_MIN_NO_ERRORS_COUNT);
> +
> +#if ENABLE_SW_WLEVEL_UTILIZATION
> +				if (!sw_wl_hw) {
> +					u64 percent_x10;
> +
> +					/* avoid dividing by zero below */
> +					if (sum_dram_dclk == 0)
> +						sum_dram_dclk = 1;
> +					percent_x10 = sum_dram_ops * 1000 /
> +						sum_dram_dclk;
> +					debug("N%d.LMC%d.R%d: ops %llu, cycles %llu, used %llu.%llu%%\n",
> +					      node, if_num, rankx, sum_dram_ops,
> +					      sum_dram_dclk, percent_x10 / 10,
> +					      percent_x10 % 10);
> +				}
> +#endif
> +				if (errors) {
> +					debug("End WLEV_64 while loop: vref_val %d(0x%x), errors 0x%02x\n",
> +					      vref_val, vref_val, errors);
> +				}
> +				// end parallel write-leveling block for
> +				// delay high-order bits
> +
> +				// if we used HW-assist, we did the ECC byte
> +				// when approp.
> +				if (sw_wl_hw) {
> +					if (wl_print) {
> +						debug("N%d.LMC%d.R%d: HW-assisted SWL - ECC estimate not needed.\n",
> +						      node, if_num, rankx);
> +					}
> +					goto no_ecc_estimate;
> +				}
> +
> +				if ((if_bytemask & 0xff) == 0xff) {
> +					/* all 8 data lanes in use: ECC lane is byte 8 */
> +					if (use_ecc) {
> +						sw_write_lvl_use_ecc(priv,
> +								     rankx);
> +					} else {
> +						/* H/W delay value */
> +						byte_test_status[8] =
> +							WL_HARDWARE;
> +						/* ECC is not used */
> +						wl_rank.s.byte8 =
> +							wl_rank.s.byte0;
> +					}
> +				} else {
> +					/* otherwise the ECC delay is kept in byte 4 */
> +					if (use_ecc) {
> +						/* Estimate the ECC byte dly */
> +						// add hi-order to b4
> +						wl_rank.s.byte4 |=
> +							(wl_rank.s.byte3 &
> +							 0x38);
> +						if ((wl_rank.s.byte4 & 0x06) <
> +						    (wl_rank.s.byte3 & 0x06)) {
> +							// must be next clock
> +							wl_rank.s.byte4 += 8;
> +						}
> +					} else {
> +						/* ECC is not used */
> +						wl_rank.s.byte4 =
> +							wl_rank.s.byte0;
> +					}
> +
> +					/*
> +					 * Change the status if s/w adjusted
> +					 * the delay
> +					 */
> +					/* Estimated delay */
> +					byte_test_status[4] = WL_SOFTWARE;
> +				}	/* if ((if_bytemask & 0xff) == 0xff) */
> +			}	/* if (wl_mask_err == 0) */
> +
> +no_ecc_estimate:
> +
> +			/* count the tested lanes still marked WL_ESTIMATED */
> +			bytes_failed = 0;
> +			for (byte = 0; byte < 9; ++byte) {
> +				/* Don't accumulate errors for untested bytes */
> +				if (!(if_bytemask & (1 << byte)))
> +					continue;
> +				bytes_failed +=
> +				    (byte_test_status[byte] == WL_ESTIMATED);
> +			}
> +
> +			/* vref training loop is only used for DDR4  */
> +			if (ddr_type != DDR4_DRAM)
> +				break;
> +
> +			if (bytes_failed == 0) {
> +				/* extend (or start) the current passing run */
> +				if (vref_val_count == 0)
> +					vref_val_start = vref_val;
> +
> +				++vref_val_count;
> +				if (vref_val_count > best_vref_val_count) {
> +					best_vref_val_count = vref_val_count;
> +					best_vref_val_start = vref_val_start;
> +					debug("N%d.LMC%d.R%d: vref Training                    (%2d) :    0x%02x <----- ???? -----> 0x%02x\n",
> +					      node, if_num, rankx, vref_val,
> +					      best_vref_val_start,
> +					      best_vref_val_start +
> +					      best_vref_val_count - 1);
> +				}
> +			} else {
> +				/* a failure ends the current passing run */
> +				vref_val_count = 0;
> +				debug("N%d.LMC%d.R%d: vref Training                    (%2d) :    failed\n",
> +				      node, if_num, rankx, vref_val);
> +			}
> +		}
> +
> +		/*
> +		 * Determine address of DRAM to test for software write
> +		 * leveling.
> +		 */
> +		rank_addr = active_rank * (1ull << (pbank_lsb - bunk_enable +
> +						    (interfaces / 2)));
> +		/* Adjust address for boot bus hole in memory map. */
> +		if (rank_addr > 0x10000000)
> +			rank_addr += 0x10000000;
> +
> +		debug("Rank Address: 0x%llx\n", rank_addr);
> +
> +		if (bytes_failed) {
> +#if !DISABLE_SW_WL_PASS_2
> +			debug("N%d.LMC%d.R%d: Starting SW Write-leveling pass 2\n",
> +			      node, if_num, rankx);
> +
> +			sw_wl_rank_status = WL_SOFTWARE;
> +
> +			/*
> +			 * If s/w fixups failed then retry using s/w
> +			 * write-leveling.
> +			 */
> +			if (wl_mask_err == 0) {
> +				/*
> +				 * h/w succeeded but s/w fixups failed.
> +				 * So retry s/w.
> +				 */
> +				debug("N%d.LMC%d.R%d: Retrying software Write-Leveling.\n",
> +				      node, if_num, rankx);
> +			}
> +
> +			// start parallel write-leveling block for delay
> +			// low-order bits
> +			parallel_wl_block_delay(priv, rankx);
> +
> +			if (use_ecc) {
> +				/*
> +				 * ECC byte has to be estimated. Take the
> +				 * average of the two surrounding bytes.
> +				 */
> +				int test_byte8 = divide_nint(wl_rank.s.byte3 +
> +							     wl_rank.s.byte4 +
> +							     2, 2);
> +				/* Use only even settings */
> +				wl_rank.s.byte8 = test_byte8 & ~1;
> +				/* Estimated delay */
> +				byte_test_status[8] = WL_ESTIMATED;
> +			} else {
> +				/* H/W delay value */
> +				byte_test_status[8] = WL_HARDWARE;
> +				/* ECC is not used */
> +				wl_rank.s.byte8 = wl_rank.s.byte0;
> +			}
> +
> +			/* Set delays for unused bytes to match byte 0. */
> +			for (byte = 0; byte < 8; ++byte) {
> +				if ((if_bytemask & (1 << byte)))
> +					continue;
> +				upd_wl_rank(&wl_rank, byte, wl_rank.s.byte0);
> +				byte_test_status[byte] = WL_SOFTWARE;
> +			}
> +#else /* !DISABLE_SW_WL_PASS_2 */
> +			// FIXME? the big hammer, did not even try SW WL pass2,
> +			// assume only chip reset will help
> +			debug("N%d.LMC%d.R%d: S/W write-leveling pass 1 failed\n",
> +			      node, if_num, rankx);
> +			sw_wl_failed = 1;
> +#endif /* !DISABLE_SW_WL_PASS_2 */
> +		} else {	/* if (bytes_failed) */
> +			// SW WL pass 1 was OK, write the settings
> +			lmc_wr(priv, CVMX_LMCX_WLEVEL_RANKX(rankx, if_num),
> +			       wl_rank.u64);
> +			wl_rank.u64 = lmc_rd(priv,
> +					     CVMX_LMCX_WLEVEL_RANKX(rankx,
> +								    if_num));
> +
> +#if SW_WL_CHECK_PATCH
> +			// do validity check on the delay values by running
> +			// the test 1 more time...
> +			// FIXME: we really need to check the ECC byte setting
> +			// here as well, so we need to enable ECC for this test!
> +			// if there are any errors, claim SW WL failure
> +			u64 datamask = (if_64b) ? 0xffffffffffffffffULL :
> +				0x00000000ffffffffULL;
> +			int errors;
> +
> +			// do the test
> +			if (sw_wl_hw) {
> +				errors = run_best_hw_patterns(priv, if_num,
> +							      rank_addr,
> +							      DBTRAIN_TEST,
> +							      NULL) & 0xff;
> +			} else {
> +				errors = test_dram_byte64(priv, if_num,
> +							  rank_addr, datamask,
> +							  NULL);
> +			}
> +
> +			if (errors) {
> +				debug("N%d.LMC%d.R%d: Wlevel Rank Final Test errors 0x%03x\n",
> +				      node, if_num, rankx, errors);
> +				sw_wl_failed = 1;
> +			}
> +#endif /* SW_WL_CHECK_PATCH */
> +		}		/* if (bytes_failed) */
> +
> +		// FIXME? dump the WL settings, so we get more of a clue
> +		// as to what happened where
> +		debug("N%d.LMC%d.R%d: Wlevel Rank %#4x, 0x%016llX  : %2d%3s %2d%3s %2d%3s %2d%3s %2d%3s %2d%3s %2d%3s %2d%3s %2d%3s %s\n",
> +		      node, if_num, rankx, wl_rank.s.status, wl_rank.u64,
> +		      wl_rank.s.byte8, wl_status_strings[byte_test_status[8]],
> +		      wl_rank.s.byte7, wl_status_strings[byte_test_status[7]],
> +		      wl_rank.s.byte6, wl_status_strings[byte_test_status[6]],
> +		      wl_rank.s.byte5, wl_status_strings[byte_test_status[5]],
> +		      wl_rank.s.byte4, wl_status_strings[byte_test_status[4]],
> +		      wl_rank.s.byte3, wl_status_strings[byte_test_status[3]],
> +		      wl_rank.s.byte2, wl_status_strings[byte_test_status[2]],
> +		      wl_rank.s.byte1, wl_status_strings[byte_test_status[1]],
> +		      wl_rank.s.byte0, wl_status_strings[byte_test_status[0]],
> +		      (sw_wl_rank_status == WL_HARDWARE) ? "" : "(s)");
> +
> +		// finally, check for fatal conditions: either chip reset
> +		// right here, or return error flag
> +		if ((ddr_type == DDR4_DRAM && best_vref_val_count == 0) ||
> +		    sw_wl_failed) {
> +			if (!ddr_disable_chip_reset) {	// do chip RESET
> +				printf("N%d.LMC%d.R%d: INFO: Short memory test indicates a retry is needed. Resetting node...\n",
> +				       node, if_num, rankx);
> +				mdelay(500);
> +				do_reset(NULL, 0, 0, NULL);
> +			} else {
> +				// return error flag so LMC init can be retried.
> +				debug("N%d.LMC%d.R%d: INFO: Short memory test indicates a retry is needed. Restarting LMC init...\n",
> +				      node, if_num, rankx);
> +				return -EAGAIN;	// non-zero: LMC init must be retried
> +			}
> +		}
> +		active_rank++;
> +	}
> +
> +	/*
> +	 * Second walk over the ranks: apply optional environment overrides
> +	 * of the write-leveling results, per byte lane and/or for the whole
> +	 * register.
> +	 */
> +	for (rankx = 0; rankx < dimm_count * 4; rankx++) {
> +		int parameter_set = 0;
> +		u64 value;
> +
> +		if (!(rank_mask & (1 << rankx)))
> +			continue;
> +
> +		wl_rank.u64 = lmc_rd(priv, CVMX_LMCX_WLEVEL_RANKX(rankx,
> +								  if_num));
> +
> +		for (i = 0; i < 9; ++i) {
> +			s = lookup_env(priv, "ddr%d_wlevel_rank%d_byte%d",
> +				       if_num, rankx, i);
> +			if (s) {
> +				parameter_set |= 1;
> +				value = strtoul(s, NULL, 0);
> +
> +				upd_wl_rank(&wl_rank, i, value);
> +			}
> +		}
> +
> +		/* a whole-register override replaces any per-byte values */
> +		s = lookup_env_ull(priv, "ddr%d_wlevel_rank%d", if_num, rankx);
> +		if (s) {
> +			parameter_set |= 1;
> +			value = strtoull(s, NULL, 0);
> +			wl_rank.u64 = value;
> +		}
> +
> +		if (parameter_set) {
> +			lmc_wr(priv, CVMX_LMCX_WLEVEL_RANKX(rankx, if_num),
> +			       wl_rank.u64);
> +			wl_rank.u64 =
> +			    lmc_rd(priv, CVMX_LMCX_WLEVEL_RANKX(rankx, if_num));
> +			display_wl(if_num, wl_rank, rankx);
> +		}
> +#if WLEXTRAS_PATCH
> +		// if there are unused entries to be filled
> +		if ((rank_mask & 0x0F) != 0x0F) {
> +			if (rankx < 3) {
> +				debug("N%d.LMC%d.R%d: checking for WLEVEL_RANK unused entries.\n",
> +				      node, if_num, rankx);
> +
> +				// if rank 0, write ranks 1 and 2 here if empty
> +				if (rankx == 0) {
> +					// check that rank 1 is empty
> +					if (!(rank_mask & (1 << 1))) {
> +						debug("N%d.LMC%d.R%d: writing WLEVEL_RANK unused entry R%d.\n",
> +						      node, if_num, rankx, 1);
> +						lmc_wr(priv,
> +						       CVMX_LMCX_WLEVEL_RANKX(1,
> +								if_num),
> +						       wl_rank.u64);
> +					}
> +
> +					// check that rank 2 is empty
> +					if (!(rank_mask & (1 << 2))) {
> +						debug("N%d.LMC%d.R%d: writing WLEVEL_RANK unused entry R%d.\n",
> +						      node, if_num, rankx, 2);
> +						lmc_wr(priv,
> +						       CVMX_LMCX_WLEVEL_RANKX(2,
> +								if_num),
> +						       wl_rank.u64);
> +					}
> +				}
> +
> +				// if rank 0, 1 or 2, write rank 3 here if empty
> +				// check that rank 3 is empty
> +				if (!(rank_mask & (1 << 3))) {
> +					debug("N%d.LMC%d.R%d: writing WLEVEL_RANK unused entry R%d.\n",
> +					      node, if_num, rankx, 3);
> +					lmc_wr(priv,
> +					       CVMX_LMCX_WLEVEL_RANKX(3,
> +								      if_num),
> +					       wl_rank.u64);
> +				}
> +			}
> +		}
> +#endif /* WLEXTRAS_PATCH */
> +	}
> +
> +	/* Enable 32-bit mode if required. */
> +	cfg.u64 = lmc_rd(priv, CVMX_LMCX_CONFIG(if_num));
> +	cfg.cn78xx.mode32b = (!if_64b);
> +	debug("%-45s : %d\n", "MODE32B", cfg.cn78xx.mode32b);
> +
> +	/* Restore the ECC configuration */
> +	if (!sw_wl_hw_default)
> +		cfg.cn78xx.ecc_ena = use_ecc;
> +
> +	lmc_wr(priv, CVMX_LMCX_CONFIG(if_num), cfg.u64);
> +
> +	return 0;
> +}
> +
> +/*
> + * Apply the custom DLL write and read offsets for the current interface.
> + *
> + * In CAVIUM_ONLY builds the DLL90 setting of each of the 9 byte lanes is
> + * first read back through LMC(if_num)_DLL_CTL3 and printed for debugging.
> + * The offsets themselves come from c_cfg->dll_write_offset and
> + * c_cfg->dll_read_offset and are handed to process_custom_dll_offsets()
> + * together with the names of the environment variables it may consult.
> + *
> + * @priv:	driver private data
> + */
> +static void lmc_dll(struct ddr_priv *priv)
> +{
> +#ifdef CAVIUM_ONLY
> +	union cvmx_lmcx_dll_ctl3 ddr_dll_ctl3;
> +	int setting[9];
> +	int i;
> +
> +	ddr_dll_ctl3.u64 = lmc_rd(priv, CVMX_LMCX_DLL_CTL3(if_num));
> +
> +	for (i = 0; i < 9; ++i) {
> +		/*
> +		 * Select byte lane i, then read the register back to fetch
> +		 * that lane's DLL90 setting.
> +		 * NOTE(review): the SET_/GET_DDR_DLL_CTL3 macros presumably
> +		 * operate on the local named ddr_dll_ctl3 - confirm before
> +		 * renaming it.
> +		 */
> +		SET_DDR_DLL_CTL3(dll90_byte_sel, ENCODE_DLL90_BYTE_SEL(i));
> +		lmc_wr(priv, CVMX_LMCX_DLL_CTL3(if_num), ddr_dll_ctl3.u64);
> +		lmc_rd(priv, CVMX_LMCX_DLL_CTL3(if_num));
> +		ddr_dll_ctl3.u64 = lmc_rd(priv, CVMX_LMCX_DLL_CTL3(if_num));
> +		setting[i] = GET_DDR_DLL_CTL3(dll90_setting);
> +		debug("%d. LMC%d_DLL_CTL3[%d] = %016llx %d\n", i, if_num,
> +		      GET_DDR_DLL_CTL3(dll90_byte_sel), ddr_dll_ctl3.u64,
> +		      setting[i]);
> +	}
> +
> +	debug("N%d.LMC%d: %-36s : %5d %5d %5d %5d %5d %5d %5d %5d %5d\n",
> +	      node, if_num, "DLL90 Setting 8:0",
> +	      setting[8], setting[7], setting[6], setting[5], setting[4],
> +	      setting[3], setting[2], setting[1], setting[0]);
> +#endif /* CAVIUM_ONLY */
> +
> +	/*
> +	 * NOTE(review): the trailing 1 / 2 argument presumably selects the
> +	 * write vs. read offset mode - confirm against
> +	 * process_custom_dll_offsets().
> +	 */
> +	process_custom_dll_offsets(priv, if_num, "ddr_dll_write_offset",
> +				   c_cfg->dll_write_offset,
> +				   "ddr%d_dll_write_offset_byte%d", 1);
> +	process_custom_dll_offsets(priv, if_num, "ddr_dll_read_offset",
> +				   c_cfg->dll_read_offset,
> +				   "ddr%d_dll_read_offset_byte%d", 2);
> +}
> +
> +/*
> + * Saturating increment of a CSR field: add @incr to csr.chip.field, but
> + * clamp the result to 63 instead of letting it exceed that value.
> + *
> + * @csr and @incr are fully parenthesized so that arbitrary expressions can
> + * be passed.  Both are evaluated more than once, so they must not have
> + * side effects.
> + */
> +#define SLOT_CTL_INCR(csr, chip, field, incr)				\
> +	((csr).chip.field = ((csr).chip.field < (64 - (incr))) ?	\
> +		((csr).chip.field + (incr)) : 63)
> +
> +/* Shorter alias for SLOT_CTL_INCR(), same semantics */
> +#define INCR(csr, chip, field, incr)					\
> +	SLOT_CTL_INCR(csr, chip, field, incr)
> +
> +/*
> + * Apply the slot-control workarounds for errata 21063 and 21216 to the
> + * LMC(if_num)_SLOT_CTL0/1/2 registers of the current interface.
> + *
> + * @priv:	driver private data
> + */
> +static void lmc_workaround_2(struct ddr_priv *priv)
> +{
> +	/*
> +	 * Errata 21063 (CN78XX, CN70XX pass 1.x): lengthen the read-to-write
> +	 * and write-to-read gaps.
> +	 */
> +	if (octeon_is_cpuid(OCTEON_CN78XX) ||
> +	    octeon_is_cpuid(OCTEON_CN70XX_PASS1_X)) {
> +		union cvmx_lmcx_ext_config ext;
> +		union cvmx_lmcx_slot_ctl0 ctl0;
> +		union cvmx_lmcx_slot_ctl1 ctl1;
> +		union cvmx_lmcx_slot_ctl2 ctl2;
> +
> +		ctl0.u64 = lmc_rd(priv, CVMX_LMCX_SLOT_CTL0(if_num));
> +		ctl1.u64 = lmc_rd(priv, CVMX_LMCX_SLOT_CTL1(if_num));
> +		ctl2.u64 = lmc_rd(priv, CVMX_LMCX_SLOT_CTL2(if_num));
> +
> +		ext.u64 = lmc_rd(priv, CVMX_LMCX_EXT_CONFIG(if_num));
> +
> +		/* One extra cycle on every R2W gap if READ_ENA_BPRCH is set */
> +		if (ext.s.read_ena_bprch) {
> +			SLOT_CTL_INCR(ctl0, cn78xx, r2w_init, 1);
> +			SLOT_CTL_INCR(ctl0, cn78xx, r2w_l_init, 1);
> +			SLOT_CTL_INCR(ctl1, cn78xx, r2w_xrank_init, 1);
> +			SLOT_CTL_INCR(ctl2, cn78xx, r2w_xdimm_init, 1);
> +		}
> +
> +		/* Two extra cycles on the cross-rank/DIMM W2R gaps, always */
> +		SLOT_CTL_INCR(ctl1, cn78xx, w2r_xrank_init, 2);
> +		SLOT_CTL_INCR(ctl2, cn78xx, w2r_xdimm_init, 2);
> +
> +		lmc_wr(priv, CVMX_LMCX_SLOT_CTL0(if_num), ctl0.u64);
> +		lmc_wr(priv, CVMX_LMCX_SLOT_CTL1(if_num), ctl1.u64);
> +		lmc_wr(priv, CVMX_LMCX_SLOT_CTL2(if_num), ctl2.u64);
> +	}
> +
> +	/*
> +	 * Errata 21216 (CN78XX pass 1.x, CN70XX pass 1.x): the cross-rank
> +	 * and cross-DIMM W2W gaps must be at least 10.
> +	 */
> +	if (octeon_is_cpuid(OCTEON_CN78XX_PASS1_X) ||
> +	    octeon_is_cpuid(OCTEON_CN70XX_PASS1_X)) {
> +		union cvmx_lmcx_slot_ctl1 ctl1;
> +		union cvmx_lmcx_slot_ctl2 ctl2;
> +
> +		ctl1.u64 = lmc_rd(priv, CVMX_LMCX_SLOT_CTL1(if_num));
> +		if ((int)ctl1.cn78xx.w2w_xrank_init < 10)
> +			ctl1.cn78xx.w2w_xrank_init = 10;
> +		lmc_wr(priv, CVMX_LMCX_SLOT_CTL1(if_num), ctl1.u64);
> +
> +		ctl2.u64 = lmc_rd(priv, CVMX_LMCX_SLOT_CTL2(if_num));
> +		if ((int)ctl2.cn78xx.w2w_xdimm_init < 10)
> +			ctl2.cn78xx.w2w_xdimm_init = 10;
> +		lmc_wr(priv, CVMX_LMCX_SLOT_CTL2(if_num), ctl2.u64);
> +	}
> +}
> +
> +/*
> + * Final LMC initialization step for the current interface: optionally
> + * dump/adjust the SLOT_CTL registers (ENABLE_SLOT_CTL_ACCESS builds), then
> + * clear the LMC interrupt register and any pending L2C TAD and MCI
> + * interrupt bits left over from training.
> + *
> + * @priv:	driver private data
> + */
> +static void lmc_final(struct ddr_priv *priv)
> +{
> +	/*
> +	 * 4.8.11 Final LMC Initialization
> +	 *
> +	 * Early LMC initialization, LMC write-leveling, and LMC read-leveling
> +	 * must be completed prior to starting this final LMC initialization.
> +	 *
> +	 * LMC hardware updates the LMC(0)_SLOT_CTL0, LMC(0)_SLOT_CTL1,
> +	 * LMC(0)_SLOT_CTL2 CSRs with minimum values based on the selected
> +	 * read-leveling and write-leveling settings. Software should not write
> +	 * the final LMC(0)_SLOT_CTL0, LMC(0)_SLOT_CTL1, and LMC(0)_SLOT_CTL2
> +	 * values until after the final read-leveling and write-leveling
> +	 * settings are written.
> +	 *
> +	 * Software must ensure the LMC(0)_SLOT_CTL0, LMC(0)_SLOT_CTL1, and
> +	 * LMC(0)_SLOT_CTL2 CSR values are appropriate for this step. These CSRs
> +	 * select the minimum gaps between read operations and write operations
> +	 * of various types.
> +	 *
> +	 * Software must not reduce the values in these CSR fields below the
> +	 * values previously selected by the LMC hardware (during write-leveling
> +	 * and read-leveling steps above).
> +	 *
> +	 * All sections in this chapter may be used to derive proper settings
> +	 * for these registers.
> +	 *
> +	 * For minimal read latency, L2C_CTL[EF_ENA,EF_CNT] should be programmed
> +	 * properly. This should be done prior to the first read.
> +	 */
> +
> +#ifdef ENABLE_SLOT_CTL_ACCESS
> +	union cvmx_lmcx_slot_ctl0 lmc_slot_ctl0;
> +	union cvmx_lmcx_slot_ctl1 lmc_slot_ctl1;
> +	union cvmx_lmcx_slot_ctl2 lmc_slot_ctl2;
> +	union cvmx_lmcx_slot_ctl3 lmc_slot_ctl3;
> +	/* result of the environment lookup below */
> +	char *s;
> +
> +	lmc_slot_ctl0.u64 = lmc_rd(priv, CVMX_LMCX_SLOT_CTL0(if_num));
> +	lmc_slot_ctl1.u64 = lmc_rd(priv, CVMX_LMCX_SLOT_CTL1(if_num));
> +	lmc_slot_ctl2.u64 = lmc_rd(priv, CVMX_LMCX_SLOT_CTL2(if_num));
> +	lmc_slot_ctl3.u64 = lmc_rd(priv, CVMX_LMCX_SLOT_CTL3(if_num));
> +
> +	debug("%-45s : 0x%016llx\n", "LMC_SLOT_CTL0", lmc_slot_ctl0.u64);
> +	debug("%-45s : 0x%016llx\n", "LMC_SLOT_CTL1", lmc_slot_ctl1.u64);
> +	debug("%-45s : 0x%016llx\n", "LMC_SLOT_CTL2", lmc_slot_ctl2.u64);
> +	debug("%-45s : 0x%016llx\n", "LMC_SLOT_CTL3", lmc_slot_ctl3.u64);
> +
> +	// for now, look only for SLOT_CTL1 envvar for override of contents
> +	s = lookup_env(priv, "ddr%d_slot_ctl1", if_num);
> +	if (s) {
> +		int slot_ctl1_incr = strtoul(s, NULL, 0);
> +		// validate the value
> +		if (slot_ctl1_incr < 1 || slot_ctl1_incr > 3) {
> +			printf("ddr%d_slot_ctl1 illegal value (%d); must be 1-3\n",
> +			       if_num, slot_ctl1_incr);
> +		} else {
> +			// modify all the SLOT_CTL1 fields by the increment,
> +			// for now...
> +			// but make sure the value will not overflow!!!
> +			INCR(lmc_slot_ctl1, s, r2r_xrank_init, slot_ctl1_incr);
> +			INCR(lmc_slot_ctl1, s, r2w_xrank_init, slot_ctl1_incr);
> +			INCR(lmc_slot_ctl1, s, w2r_xrank_init, slot_ctl1_incr);
> +			INCR(lmc_slot_ctl1, s, w2w_xrank_init, slot_ctl1_incr);
> +			lmc_wr(priv, CVMX_LMCX_SLOT_CTL1(if_num),
> +			       lmc_slot_ctl1.u64);
> +		}
> +	}
> +#endif /* ENABLE_SLOT_CTL_ACCESS */
> +
> +	/* Clear any residual ECC errors */
> +	int num_tads = 1;
> +	int tad;
> +	int num_mcis = 1;
> +	int mci;
> +
> +	/* The number of L2C TAD and MCI units depends on the chip model */
> +	if (octeon_is_cpuid(OCTEON_CN78XX)) {
> +		num_tads = 8;
> +		num_mcis = 4;
> +	} else if (octeon_is_cpuid(OCTEON_CN70XX)) {
> +		num_tads = 1;
> +		num_mcis = 1;
> +	} else if (octeon_is_cpuid(OCTEON_CN73XX) ||
> +		   octeon_is_cpuid(OCTEON_CNF75XX)) {
> +		num_tads = 4;
> +		num_mcis = 3;
> +	}
> +
> +	/* Write all ones to LMC_INT, then read it back */
> +	lmc_wr(priv, CVMX_LMCX_INT(if_num), -1ULL);
> +	lmc_rd(priv, CVMX_LMCX_INT(if_num));
> +
> +	/* Write back whatever is currently set in each TAD/MCI INT register */
> +	for (tad = 0; tad < num_tads; tad++) {
> +		l2c_wr(priv, CVMX_L2C_TADX_INT(tad),
> +		       l2c_rd(priv, CVMX_L2C_TADX_INT(tad)));
> +		debug("%-45s : (%d) 0x%08llx\n", "CVMX_L2C_TAD_INT", tad,
> +		      l2c_rd(priv, CVMX_L2C_TADX_INT(tad)));
> +	}
> +
> +	for (mci = 0; mci < num_mcis; mci++) {
> +		l2c_wr(priv, CVMX_L2C_MCIX_INT(mci),
> +		       l2c_rd(priv, CVMX_L2C_MCIX_INT(mci)));
> +		debug("%-45s : (%d) 0x%08llx\n", "L2C_MCI_INT", mci,
> +		      l2c_rd(priv, CVMX_L2C_MCIX_INT(mci)));
> +	}
> +
> +	debug("%-45s : 0x%08llx\n", "LMC_INT",
> +	      lmc_rd(priv, CVMX_LMCX_INT(if_num)));
> +}
> +
> +/*
> + * Program the data-scrambling configuration of the current interface.
> + *
> + * Scrambling is off by default.  Setting the environment variable
> + * "ddr_use_scramble" to a non-zero value enables it with random keys taken
> + * from cvmx_rng_get_random64().  The variables "ddr_scramble_cfg0",
> + * "ddr_scramble_cfg1" and "ddr_scramble_cfg2" supply explicit key values
> + * and each of them also enables scrambling; "ddr_ns_ctl" overrides
> + * LMC(if_num)_NS_CTL.  LMC(if_num)_SCRAMBLE_CFG2 is neither read nor
> + * written on CN78XX pass 1.x.
> + *
> + * @priv:	driver private data
> + */
> +static void lmc_scrambling(struct ddr_priv *priv)
> +{
> +	// Make sure scrambling is disabled during init...
> +	union cvmx_lmcx_control ctrl;
> +	union cvmx_lmcx_scramble_cfg0 lmc_scramble_cfg0;
> +	union cvmx_lmcx_scramble_cfg1 lmc_scramble_cfg1;
> +	union cvmx_lmcx_scramble_cfg2 lmc_scramble_cfg2;
> +	union cvmx_lmcx_ns_ctl lmc_ns_ctl;
> +	int use_scramble = 0;	// default OFF
> +	char *s;
> +
> +	ctrl.u64 = lmc_rd(priv, CVMX_LMCX_CONTROL(if_num));
> +	lmc_scramble_cfg0.u64 = lmc_rd(priv, CVMX_LMCX_SCRAMBLE_CFG0(if_num));
> +	lmc_scramble_cfg1.u64 = lmc_rd(priv, CVMX_LMCX_SCRAMBLE_CFG1(if_num));
> +	lmc_scramble_cfg2.u64 = 0;	// quiet compiler
> +	if (!octeon_is_cpuid(OCTEON_CN78XX_PASS1_X)) {
> +		lmc_scramble_cfg2.u64 =
> +		    lmc_rd(priv, CVMX_LMCX_SCRAMBLE_CFG2(if_num));
> +	}
> +	lmc_ns_ctl.u64 = lmc_rd(priv, CVMX_LMCX_NS_CTL(if_num));
> +
> +	s = lookup_env_ull(priv, "ddr_use_scramble");
> +	if (s)
> +		use_scramble = simple_strtoull(s, NULL, 0);
> +
> +	/* Generate random values if scrambling is needed */
> +	if (use_scramble) {
> +		lmc_scramble_cfg0.u64 = cvmx_rng_get_random64();
> +		lmc_scramble_cfg1.u64 = cvmx_rng_get_random64();
> +		lmc_scramble_cfg2.u64 = cvmx_rng_get_random64();
> +		lmc_ns_ctl.s.ns_scramble_dis = 0;
> +		lmc_ns_ctl.s.adr_offset = 0;
> +		ctrl.s.scramble_ena = 1;
> +	}
> +
> +	/* An explicit key from the environment overrides the random one */
> +	s = lookup_env_ull(priv, "ddr_scramble_cfg0");
> +	if (s) {
> +		lmc_scramble_cfg0.u64 = simple_strtoull(s, NULL, 0);
> +		ctrl.s.scramble_ena = 1;
> +	}
> +	debug("%-45s : 0x%016llx\n", "LMC_SCRAMBLE_CFG0",
> +	      lmc_scramble_cfg0.u64);
> +
> +	lmc_wr(priv, CVMX_LMCX_SCRAMBLE_CFG0(if_num), lmc_scramble_cfg0.u64);
> +
> +	s = lookup_env_ull(priv, "ddr_scramble_cfg1");
> +	if (s) {
> +		lmc_scramble_cfg1.u64 = simple_strtoull(s, NULL, 0);
> +		ctrl.s.scramble_ena = 1;
> +	}
> +	debug("%-45s : 0x%016llx\n", "LMC_SCRAMBLE_CFG1",
> +	      lmc_scramble_cfg1.u64);
> +	lmc_wr(priv, CVMX_LMCX_SCRAMBLE_CFG1(if_num), lmc_scramble_cfg1.u64);
> +
> +	if (!octeon_is_cpuid(OCTEON_CN78XX_PASS1_X)) {
> +		s = lookup_env_ull(priv, "ddr_scramble_cfg2");
> +		if (s) {
> +			lmc_scramble_cfg2.u64 = simple_strtoull(s, NULL, 0);
> +			ctrl.s.scramble_ena = 1;
> +		}
> +		/* print the CFG2 value that is actually written below */
> +		debug("%-45s : 0x%016llx\n", "LMC_SCRAMBLE_CFG2",
> +		      lmc_scramble_cfg2.u64);
> +		lmc_wr(priv, CVMX_LMCX_SCRAMBLE_CFG2(if_num),
> +		       lmc_scramble_cfg2.u64);
> +	}
> +
> +	s = lookup_env_ull(priv, "ddr_ns_ctl");
> +	if (s)
> +		lmc_ns_ctl.u64 = simple_strtoull(s, NULL, 0);
> +	debug("%-45s : 0x%016llx\n", "LMC_NS_CTL", lmc_ns_ctl.u64);
> +	lmc_wr(priv, CVMX_LMCX_NS_CTL(if_num), lmc_ns_ctl.u64);
> +
> +	/* CONTROL is written last, after the keys and NS_CTL */
> +	lmc_wr(priv, CVMX_LMCX_CONTROL(if_num), ctrl.u64);
> +}
> +
> +/*
> + * One read-leveling result: a packed RLEVEL_RANKX register value and
> + * the error score it achieved (lower is better).
> + */
> +struct rl_score {
> +	u64 setting;	/* rl_rank.u64 that goes with the score */
> +	int score;	/* best rank error score recorded for it */
> +};
> +
> +/*
> + * File-scope working state shared by the read-leveling helpers below
> + * (rodt_loop(), rank_major_loop(), ...). Every object is forced into
> + * .data through __section().
> + * NOTE(review): presumably because .bss is not usable at the time this
> + * code runs - confirm.
> + */
> +
> +/* RLEVEL_RANKX value of the rank currently being leveled */
> +static union cvmx_lmcx_rlevel_rankx rl_rank __section(".data");
> +/* handed to compute_ddr3_rlevel_delay() when delays are recomputed */
> +static union cvmx_lmcx_rlevel_ctl rl_ctl __section(".data");
> +/* RODT setting under test; indexes imp_val->rodt_ohms[] */
> +static unsigned char rodt_ctl __section(".data");
> +
> +/* sum of the rank error scores over all samples of one RODT setting */
> +static int rl_rodt_err __section(".data");
> +/* RTT_NOM value under test, taken from imp_val->rtt_nom_table[rtt_idx] */
> +static unsigned char rtt_nom __section(".data");
> +static unsigned char rtt_idx __section(".data");
> +/* inclusive bounds of the RTT_NOM index / RODT search loops */
> +static char min_rtt_nom_idx __section(".data");
> +static char max_rtt_nom_idx __section(".data");
> +static char min_rodt_ctl __section(".data");
> +static char max_rodt_ctl __section(".data");
> +static int rl_dbg_loops __section(".data");
> +static unsigned char save_ddr2t __section(".data");
> +/* number of read-leveling samples taken per RODT setting */
> +static int rl_samples __section(".data");
> +/* non-zero: replace the hardware delays by ones computed from the bitmasks */
> +static char rl_compute __section(".data");
> +static char saved_ddr__ptune __section(".data");
> +static char saved_ddr__ntune __section(".data");
> +static char rl_comp_offs __section(".data");
> +static char saved_int_zqcs_dis __section(".data");
> +/* passed to nonseq_del() as the allowed adjacent-byte delay increment */
> +static int max_adj_rl_del_inc __section(".data");
> +/* RTT_NOM ohms value handed to display_rl_with_rodt(); -1 when unused */
> +static int print_nom_ohms __section(".data");
> +/* print verbosity: non-zero prints rows, > 1 also prints bitmasks/scores */
> +static int rl_print __section(".data");
> +
> +#ifdef ENABLE_HARDCODED_RLEVEL
> +static char part_number[21] __section(".data");
> +#endif /* ENABLE_HARDCODED_RLEVEL */
> +
> +#if PERFECT_BITMASK_COUNTING
> +/* Per-byte statistics of delays whose bitmask validated without errors */
> +struct perfect_counts {
> +	// 8+ECC bytes by 32 delay values
> +	// NOTE(review): the original comment said "by 64 values" while the
> +	// array (and the u32 mask) only cover 32 - confirm intended range
> +	u16 count[9][32];
> +	u32 mask[9];      // 8+ECC, bitmask of perfect delays
> +};
> +
> +/* accumulated per rank (indexed by rankx) and per RODT setting */
> +static struct perfect_counts rank_perf[4] __section(".data");
> +static struct perfect_counts rodt_perfect_counts __section(".data");
> +static int pbm_lowsum_limit __section(".data");
> +// FIXME: PBM skip for RODT 240 and 34
> +// bit n set: do not count perfect bitmasks for rodt_ctl == n
> +static u32 pbm_rodt_skip __section(".data");
> +#endif /* PERFECT_BITMASK_COUNTING */
> +
> +// control rank majority processing
> +static int disable_rank_majority __section(".data");
> +
> +// default to mask 11b ODDs for DDR4 (except 73xx), else DISABLE
> +// for DDR3
> +static int enable_rldelay_bump __section(".data");
> +static int rldelay_bump_incr __section(".data");
> +static int disable_rlv_bump_this_byte __section(".data");
> +static u64 value_mask __section(".data");
> +
> +/* per-byte delays, best values and scores in unpacked (physical) order */
> +static struct rlevel_byte_data rl_byte[9] __section(".data");
> +/* sample counter and sample limit of the loop in rodt_loop() */
> +static int sample_loops __section(".data");
> +static int max_samples __section(".data");
> +/* rl_rank_errors = rl_mask_err + rl_nonseq_err for the current sample */
> +static int rl_rank_errors __section(".data");
> +static int rl_mask_err __section(".data");
> +static int rl_nonseq_err __section(".data");
> +/* per-byte read-leveling bitmasks and their error scores (packed order) */
> +static struct rlevel_bitmask rl_mask[9] __section(".data");
> +/* lowest rl_rank_errors seen over the samples of one RODT setting */
> +static int rl_best_rank_score __section(".data");
> +
> +/* bit n set: RODT row n is skipped when selecting/printing results */
> +static int rodt_row_skip_mask __section(".data");
> +
> +/*
> + * Read-level one rank for the RODT setting currently held in the
> + * file-scope variable rodt_ctl and record the best result.
> + *
> + * The RODT value is programmed into COMP_CTL2, then read-leveling is run
> + * rl_samples times (2 * rl_samples + 1 times when rodt_ctl equals
> + * default_rodt_ctl). Each sample is scored as the sum of the bitmask
> + * validation errors and the non-sequential delay errors; the sample with
> + * the lowest score wins. Its score and packed delay setting are stored in
> + * rl_score[rtt_nom][rodt_ctl][rankx].
> + *
> + * Besides its parameters the function works on file-scope state (rl_rank,
> + * rl_byte[], rl_mask[], rl_samples, rl_compute, ...) and on configuration
> + * such as if_num, ddr_type, spd_rdimm, ecc_ena and if_bytemask, which are
> + * defined outside this function.
> + *
> + * @priv:     driver private data, passed through to the register accessors
> + * @rankx:    rank to level
> + * @rl_score: result table indexed by [rtt_nom][rodt_ctl][rank]
> + */
> +static void rodt_loop(struct ddr_priv *priv, int rankx, struct rl_score
> +		      rl_score[RTT_NOM_OHMS_COUNT][RODT_OHMS_COUNT][4])
> +{
> +	union cvmx_lmcx_comp_ctl2 cc2;
> +	const int rl_separate_ab = 1;
> +	int i;
> +
> +	rl_best_rank_score = DEFAULT_BEST_RANK_SCORE;
> +	rl_rodt_err = 0;
> +	cc2.u64 = lmc_rd(priv, CVMX_LMCX_COMP_CTL2(if_num));
> +	cc2.cn78xx.rodt_ctl = rodt_ctl;
> +	lmc_wr(priv, CVMX_LMCX_COMP_CTL2(if_num), cc2.u64);
> +	cc2.u64 = lmc_rd(priv, CVMX_LMCX_COMP_CTL2(if_num));
> +	udelay(1); /* Give it a little time to take effect */
> +	if (rl_print > 1) {
> +		debug("Read ODT_CTL                                  : 0x%x (%d ohms)\n",
> +		      cc2.cn78xx.rodt_ctl,
> +		      imp_val->rodt_ohms[cc2.cn78xx.rodt_ctl]);
> +	}
> +
> +	memset(rl_byte, 0, sizeof(rl_byte));
> +#if PERFECT_BITMASK_COUNTING
> +	memset(&rodt_perfect_counts, 0, sizeof(rodt_perfect_counts));
> +#endif /* PERFECT_BITMASK_COUNTING */
> +
> +	// when iter RODT is the target RODT, take more samples...
> +	max_samples = rl_samples;
> +	if (rodt_ctl == default_rodt_ctl)
> +		max_samples += rl_samples + 1;
> +
> +	for (sample_loops = 0; sample_loops < max_samples; sample_loops++) {
> +		int redoing_nonseq_errs = 0;
> +
> +		rl_mask_err = 0;
> +
> +		if (!(rl_separate_ab && spd_rdimm &&
> +		      ddr_type == DDR4_DRAM)) {
> +			/* Clear read-level delays */
> +			lmc_wr(priv, CVMX_LMCX_RLEVEL_RANKX(rankx, if_num), 0);
> +
> +			/* read-leveling */
> +			oct3_ddr3_seq(priv, 1 << rankx, if_num, 1);
> +
> +			do {
> +				rl_rank.u64 =
> +					lmc_rd(priv,
> +					       CVMX_LMCX_RLEVEL_RANKX(rankx,
> +								      if_num));
> +			} while (rl_rank.cn78xx.status != 3);
> +		}
> +
> +		rl_rank.u64 =
> +			lmc_rd(priv, CVMX_LMCX_RLEVEL_RANKX(rankx, if_num));
> +
> +		// start bitmask interpretation block
> +
> +		memset(rl_mask, 0, sizeof(rl_mask));
> +
> +		/*
> +		 * DDR4 RDIMM: level the two sides separately, bytes 0-3 and
> +		 * ECC with MPR location 0, bytes 4-7 with MPR location 3,
> +		 * then merge the A-side delays into the final B-side result.
> +		 */
> +		if (rl_separate_ab && spd_rdimm && ddr_type == DDR4_DRAM) {
> +			union cvmx_lmcx_rlevel_rankx rl_rank_aside;
> +			union cvmx_lmcx_modereg_params0 mp0;
> +
> +			/* A-side */
> +			mp0.u64 =
> +				lmc_rd(priv, CVMX_LMCX_MODEREG_PARAMS0(if_num));
> +			mp0.s.mprloc = 0; /* MPR Page 0 Location 0 */
> +			lmc_wr(priv,
> +			       CVMX_LMCX_MODEREG_PARAMS0(if_num),
> +			       mp0.u64);
> +
> +			/* Clear read-level delays */
> +			lmc_wr(priv, CVMX_LMCX_RLEVEL_RANKX(rankx, if_num), 0);
> +
> +			/* read-leveling */
> +			oct3_ddr3_seq(priv, 1 << rankx, if_num, 1);
> +
> +			do {
> +				rl_rank.u64 =
> +					lmc_rd(priv,
> +					       CVMX_LMCX_RLEVEL_RANKX(rankx,
> +								      if_num));
> +			} while (rl_rank.cn78xx.status != 3);
> +
> +			rl_rank.u64 =
> +				lmc_rd(priv, CVMX_LMCX_RLEVEL_RANKX(rankx,
> +								    if_num));
> +
> +			rl_rank_aside.u64 = rl_rank.u64;
> +
> +			rl_mask[0].bm = lmc_ddr3_rl_dbg_read(priv, if_num, 0);
> +			rl_mask[1].bm = lmc_ddr3_rl_dbg_read(priv, if_num, 1);
> +			rl_mask[2].bm = lmc_ddr3_rl_dbg_read(priv, if_num, 2);
> +			rl_mask[3].bm = lmc_ddr3_rl_dbg_read(priv, if_num, 3);
> +			rl_mask[8].bm = lmc_ddr3_rl_dbg_read(priv, if_num, 8);
> +			/* A-side complete */
> +
> +			/* B-side */
> +			mp0.u64 =
> +				lmc_rd(priv, CVMX_LMCX_MODEREG_PARAMS0(if_num));
> +			mp0.s.mprloc = 3; /* MPR Page 0 Location 3 */
> +			lmc_wr(priv, CVMX_LMCX_MODEREG_PARAMS0(if_num),
> +			       mp0.u64);
> +
> +			/* Clear read-level delays */
> +			lmc_wr(priv, CVMX_LMCX_RLEVEL_RANKX(rankx, if_num), 0);
> +
> +			/* read-leveling */
> +			oct3_ddr3_seq(priv, 1 << rankx, if_num, 1);
> +
> +			do {
> +				rl_rank.u64 =
> +					lmc_rd(priv,
> +					       CVMX_LMCX_RLEVEL_RANKX(rankx,
> +								      if_num));
> +			} while (rl_rank.cn78xx.status != 3);
> +
> +			rl_rank.u64 =
> +				lmc_rd(priv, CVMX_LMCX_RLEVEL_RANKX(rankx,
> +								    if_num));
> +
> +			rl_mask[4].bm = lmc_ddr3_rl_dbg_read(priv, if_num, 4);
> +			rl_mask[5].bm = lmc_ddr3_rl_dbg_read(priv, if_num, 5);
> +			rl_mask[6].bm = lmc_ddr3_rl_dbg_read(priv, if_num, 6);
> +			rl_mask[7].bm = lmc_ddr3_rl_dbg_read(priv, if_num, 7);
> +			/* B-side complete */
> +
> +			upd_rl_rank(&rl_rank, 0, rl_rank_aside.s.byte0);
> +			upd_rl_rank(&rl_rank, 1, rl_rank_aside.s.byte1);
> +			upd_rl_rank(&rl_rank, 2, rl_rank_aside.s.byte2);
> +			upd_rl_rank(&rl_rank, 3, rl_rank_aside.s.byte3);
> +			/* ECC A-side */
> +			upd_rl_rank(&rl_rank, 8, rl_rank_aside.s.byte8);
> +
> +			mp0.u64 =
> +				lmc_rd(priv, CVMX_LMCX_MODEREG_PARAMS0(if_num));
> +			mp0.s.mprloc = 0; /* MPR Page 0 Location 0 */
> +			lmc_wr(priv, CVMX_LMCX_MODEREG_PARAMS0(if_num),
> +			       mp0.u64);
> +		}
> +
> +		/*
> +		 * Evaluate the quality of the read-leveling delays from the
> +		 * bitmasks. Also save off a software computed read-leveling
> +		 * mask that may be used later to qualify the delay results
> +		 * from Octeon.
> +		 */
> +		for (i = 0; i < (8 + ecc_ena); ++i) {
> +			int bmerr;
> +
> +			if (!(if_bytemask & (1 << i)))
> +				continue;
> +			if (!(rl_separate_ab && spd_rdimm &&
> +			      ddr_type == DDR4_DRAM)) {
> +				rl_mask[i].bm =
> +					lmc_ddr3_rl_dbg_read(priv, if_num, i);
> +			}
> +			bmerr = validate_ddr3_rlevel_bitmask(&rl_mask[i],
> +							     ddr_type);
> +			rl_mask[i].errs = bmerr;
> +			rl_mask_err += bmerr;
> +#if PERFECT_BITMASK_COUNTING
> +			// count only the "perfect" bitmasks
> +			if (ddr_type == DDR4_DRAM && !bmerr) {
> +				int delay;
> +				// FIXME: for now, simple filtering:
> +				// do NOT count PBMs for RODTs in skip mask
> +				if ((1U << rodt_ctl) & pbm_rodt_skip)
> +					continue;
> +				// FIXME: could optimize this a bit?
> +				delay = get_rl_rank(&rl_rank, i);
> +				rank_perf[rankx].count[i][delay] += 1;
> +				rank_perf[rankx].mask[i] |=
> +					(1ULL << delay);
> +				rodt_perfect_counts.count[i][delay] += 1;
> +				rodt_perfect_counts.mask[i] |= (1ULL << delay);
> +			}
> +#endif /* PERFECT_BITMASK_COUNTING */
> +		}
> +
> +		/* Set delays for unused bytes to match byte 0. */
> +		for (i = 0; i < 9; ++i) {
> +			if (if_bytemask & (1 << i))
> +				continue;
> +			upd_rl_rank(&rl_rank, i, rl_rank.s.byte0);
> +		}
> +
> +		/*
> +		 * Save a copy of the byte delays in physical
> +		 * order for sequential evaluation.
> +		 */
> +		unpack_rlevel_settings(if_bytemask, ecc_ena, rl_byte, rl_rank);
> +
> +	redo_nonseq_errs:
> +
> +		rl_nonseq_err  = 0;
> +		if (!disable_sequential_delay_check) {
> +			for (i = 0; i < 9; ++i)
> +				rl_byte[i].sqerrs = 0;
> +
> +			if ((if_bytemask & 0xff) == 0xff) {
> +				/*
> +				 * Evaluate delay sequence across the whole
> +				 * range of bytes for standard dimms.
> +				 */
> +				/* 1=RDIMM, 5=Mini-RDIMM */
> +				if (spd_dimm_type == 1 || spd_dimm_type == 5) {
> +					int reg_adj_del = abs(rl_byte[4].delay -
> +							      rl_byte[5].delay);
> +
> +					/*
> +					 * Registered dimm topology routes
> +					 * from the center.
> +					 */
> +					rl_nonseq_err +=
> +						nonseq_del(rl_byte, 0,
> +							   3 + ecc_ena,
> +							   max_adj_rl_del_inc);
> +					rl_nonseq_err +=
> +						nonseq_del(rl_byte, 5,
> +							   7 + ecc_ena,
> +							   max_adj_rl_del_inc);
> +					// byte 5 sqerrs never gets cleared
> +					// for RDIMMs
> +					rl_byte[5].sqerrs = 0;
> +					if (reg_adj_del > 1) {
> +						/*
> +						 * Assess proximity of bytes on
> +						 * opposite sides of register
> +						 */
> +						rl_nonseq_err += (reg_adj_del -
> +								  1) *
> +							RLEVEL_ADJACENT_DELAY_ERROR;
> +						// update byte 5 error
> +						rl_byte[5].sqerrs +=
> +							(reg_adj_del - 1) *
> +							RLEVEL_ADJACENT_DELAY_ERROR;
> +					}
> +				}
> +
> +				/* 2=UDIMM, 6=Mini-UDIMM */
> +				if (spd_dimm_type == 2 || spd_dimm_type == 6) {
> +					/*
> +					 * Unbuffered dimm topology routes
> +					 * from end to end.
> +					 */
> +					rl_nonseq_err += nonseq_del(rl_byte, 0,
> +								    7 + ecc_ena,
> +								    max_adj_rl_del_inc);
> +				}
> +			} else {
> +				rl_nonseq_err += nonseq_del(rl_byte, 0,
> +							    3 + ecc_ena,
> +							    max_adj_rl_del_inc);
> +			}
> +		} /* if (! disable_sequential_delay_check) */
> +
> +		rl_rank_errors = rl_mask_err + rl_nonseq_err;
> +
> +		// print original sample here only if we are not really
> +		// averaging or picking best
> +		// also do not print if we were redoing the NONSEQ score
> +		// for using COMPUTED
> +		if (!redoing_nonseq_errs && rl_samples < 2) {
> +			if (rl_print > 1) {
> +				display_rl_bm(if_num, rankx, rl_mask, ecc_ena);
> +				display_rl_bm_scores(if_num, rankx, rl_mask,
> +						     ecc_ena);
> +				display_rl_seq_scores(if_num, rankx, rl_byte,
> +						      ecc_ena);
> +			}
> +			display_rl_with_score(if_num, rl_rank, rankx,
> +					      rl_rank_errors);
> +		}
> +
> +		if (rl_compute) {
> +			if (!redoing_nonseq_errs) {
> +				/* Recompute the delays based on the bitmask */
> +				for (i = 0; i < (8 + ecc_ena); ++i) {
> +					if (!(if_bytemask & (1 << i)))
> +						continue;
> +
> +					upd_rl_rank(&rl_rank, i,
> +						    compute_ddr3_rlevel_delay(
> +							    rl_mask[i].mstart,
> +							    rl_mask[i].width,
> +							    rl_ctl));
> +				}
> +
> +				/*
> +				 * Override the copy of byte delays with the
> +				 * computed results.
> +				 */
> +				unpack_rlevel_settings(if_bytemask, ecc_ena,
> +						       rl_byte, rl_rank);
> +
> +				/* Re-score once with the computed delays */
> +				redoing_nonseq_errs = 1;
> +				goto redo_nonseq_errs;
> +
> +			} else {
> +				/*
> +				 * now print this if already printed the
> +				 * original sample
> +				 */
> +				if (rl_samples < 2 || rl_print) {
> +					display_rl_with_computed(if_num,
> +								 rl_rank, rankx,
> +								 rl_rank_errors);
> +				}
> +			}
> +		} /* if (rl_compute) */
> +
> +		// end bitmask interpretation block
> +
> +		// if it is a better (lower) score, then  keep it
> +		if (rl_rank_errors < rl_best_rank_score) {
> +			rl_best_rank_score = rl_rank_errors;
> +
> +			// save the new best delays and best errors
> +			for (i = 0; i < (8 + ecc_ena); ++i) {
> +				rl_byte[i].best = rl_byte[i].delay;
> +				rl_byte[i].bestsq = rl_byte[i].sqerrs;
> +				// save bitmasks and their scores as well
> +				// xlate UNPACKED index to PACKED index to
> +				// get from rl_mask
> +				rl_byte[i].bm = rl_mask[XUP(i, !!ecc_ena)].bm;
> +				rl_byte[i].bmerrs =
> +					rl_mask[XUP(i, !!ecc_ena)].errs;
> +			}
> +		}
> +
> +		rl_rodt_err += rl_rank_errors;
> +	}
> +
> +	/* We recorded the best score across the averaging loops */
> +	rl_score[rtt_nom][rodt_ctl][rankx].score = rl_best_rank_score;
> +
> +	/*
> +	 * Restore the delays from the best fields that go with the best
> +	 * score
> +	 */
> +	for (i = 0; i < 9; ++i) {
> +		rl_byte[i].delay = rl_byte[i].best;
> +		rl_byte[i].sqerrs = rl_byte[i].bestsq;
> +	}
> +
> +	rl_rank.u64 = lmc_rd(priv, CVMX_LMCX_RLEVEL_RANKX(rankx, if_num));
> +
> +	pack_rlevel_settings(if_bytemask, ecc_ena, rl_byte, &rl_rank);
> +
> +	if (rl_samples > 1) {
> +		// restore the "best" bitmasks and their scores for printing
> +		for (i = 0; i < 9; ++i) {
> +			if ((if_bytemask & (1 << i)) == 0)
> +				continue;
> +			// xlate PACKED index to UNPACKED index to get from
> +			// rl_byte
> +			rl_mask[i].bm   = rl_byte[XPU(i, !!ecc_ena)].bm;
> +			rl_mask[i].errs = rl_byte[XPU(i, !!ecc_ena)].bmerrs;
> +		}
> +
> +		// maybe print bitmasks/scores here
> +		if (rl_print > 1) {
> +			display_rl_bm(if_num, rankx, rl_mask, ecc_ena);
> +			display_rl_bm_scores(if_num, rankx, rl_mask, ecc_ena);
> +			display_rl_seq_scores(if_num, rankx, rl_byte, ecc_ena);
> +
> +			display_rl_with_rodt(if_num, rl_rank, rankx,
> +					     rl_score[rtt_nom][rodt_ctl][rankx].score,
> +					     print_nom_ohms,
> +					     imp_val->rodt_ohms[rodt_ctl],
> +					     WITH_RODT_BESTSCORE);
> +
> +			debug("-----------\n");
> +		}
> +	}
> +
> +	rl_score[rtt_nom][rodt_ctl][rankx].setting = rl_rank.u64;
> +
> +#if PERFECT_BITMASK_COUNTING
> +	// print out the PBMs for the current RODT
> +	if (ddr_type == DDR4_DRAM && rl_print > 1) { // verbosity?
> +#if PRINT_PERFECT_COUNTS
> +		// FIXME: change verbosity level after debug complete...
> +
> +		for (i = 0; i < 9; i++) {
> +			u64 temp_mask;
> +			int num_values;
> +			int first;
> +			int delay;
> +
> +			// FIXME: PBM skip for RODTs in mask
> +			if ((1U << rodt_ctl) & pbm_rodt_skip)
> +				continue;
> +
> +			temp_mask = rodt_perfect_counts.mask[i];
> +			num_values = __builtin_popcountll(temp_mask);
> +			/*
> +			 * Lowest delay with a perfect bitmask; keep the
> +			 * shift at 0 for an empty mask (ffsll() returns 0).
> +			 */
> +			first = temp_mask ? __builtin_ffsll(temp_mask) - 1 : 0;
> +
> +			debug("N%d.LMC%d.R%d: PERFECT: RODT %3d: Byte %d: mask 0x%02llx (%d): ",
> +			      node, if_num, rankx,
> +			      imp_val->rodt_ohms[rodt_ctl],
> +			      i, temp_mask >> first, num_values);
> +
> +			/*
> +			 * Walk the set bits with a separate variable so the
> +			 * byte index i of the outer loop stays intact.
> +			 */
> +			while (temp_mask != 0) {
> +				delay = __builtin_ffsll(temp_mask) - 1;
> +				debug("%2d(%2d) ", delay,
> +				      rodt_perfect_counts.count[i][delay]);
> +				temp_mask &= ~(1ULL << delay);
> +			} /* while (temp_mask != 0) */
> +			debug("\n");
> +		}
> +#endif
> +	}
> +#endif /* PERFECT_BITMASK_COUNTING */
> +}
> +
> +static void rank_major_loop(struct ddr_priv *priv, int rankx, struct rl_score
> +			    rl_score[RTT_NOM_OHMS_COUNT][RODT_OHMS_COUNT][4])
> +{
> +	/* Start with an arbitrarily high score */
> +	int best_rank_score = DEFAULT_BEST_RANK_SCORE;
> +	int best_rank_rtt_nom = 0;
> +	int best_rank_ctl = 0;
> +	int best_rank_ohms = 0;
> +	int best_rankx = 0;
> +	int dimm_rank_mask;
> +	int max_rank_score;
> +	union cvmx_lmcx_rlevel_rankx saved_rl_rank;
> +	int next_ohms;
> +	int orankx;
> +	int next_score = 0;
> +	int best_byte, new_byte, temp_byte, orig_best_byte;
> +	int rank_best_bytes[9];
> +	int byte_sh;
> +	int avg_byte;
> +	int avg_diff;
> +	int i;
> +
> +	if (!(rank_mask & (1 << rankx)))
> +		return;
> +
> +	// some of the rank-related loops below need to operate only on
> +	// the ranks of a single DIMM,
> +	// so create a mask for their use here
> +	if (num_ranks == 4) {
> +		dimm_rank_mask = rank_mask; // should be 1111
> +	} else {
> +		dimm_rank_mask = rank_mask & 3; // should be 01 or 11
> +		if (rankx >= 2) {
> +			// doing a rank on the second DIMM, should be
> +			// 0100 or 1100
> +			dimm_rank_mask <<= 2;
> +		}
> +	}
> +	debug("DIMM rank mask: 0x%x, rank mask: 0x%x, rankx: %d\n",
> +	      dimm_rank_mask, rank_mask, rankx);
> +
> +	// this is the start of the BEST ROW SCORE LOOP
> +
> +	for (rtt_idx = min_rtt_nom_idx; rtt_idx <= max_rtt_nom_idx; ++rtt_idx) {
> +		rtt_nom = imp_val->rtt_nom_table[rtt_idx];
> +
> +		debug("N%d.LMC%d.R%d: starting RTT_NOM %d (%d)\n",
> +		      node, if_num, rankx, rtt_nom,
> +		      imp_val->rtt_nom_ohms[rtt_nom]);
> +
> +		for (rodt_ctl = max_rodt_ctl; rodt_ctl >= min_rodt_ctl;
> +		     --rodt_ctl) {
> +			next_ohms = imp_val->rodt_ohms[rodt_ctl];
> +
> +			// skip RODT rows in mask, but *NOT* rows with too
> +			// high a score;
> +			// we will not use the skipped ones for printing or
> +			// evaluating, but we need to allow all the
> +			// non-skipped ones to be candidates for "best"
> +			if (((1 << rodt_ctl) & rodt_row_skip_mask) != 0) {
> +				debug("N%d.LMC%d.R%d: SKIPPING rodt:%d (%d) with rank_score:%d\n",
> +				      node, if_num, rankx, rodt_ctl,
> +				      next_ohms, next_score);
> +				continue;
> +			}
> +
> +			// this is ROFFIX-0528
> +			for (orankx = 0; orankx < dimm_count * 4; orankx++) {
> +				// stay on the same DIMM
> +				if (!(dimm_rank_mask & (1 << orankx)))
> +					continue;
> +
> +				next_score = rl_score[rtt_nom][rodt_ctl][orankx].score;
> +
> +				// always skip a higher score
> +				if (next_score > best_rank_score)
> +					continue;
> +
> +				// if scores are equal
> +				if (next_score == best_rank_score) {
> +					// always skip lower ohms
> +					if (next_ohms < best_rank_ohms)
> +						continue;
> +
> +					// if same ohms
> +					if (next_ohms == best_rank_ohms) {
> +						// always skip the other rank(s)
> +						if (orankx != rankx)
> +							continue;
> +					}
> +					// else next_ohms are greater,
> +					// always choose it
> +				}
> +				// else next_score is less than current best,
> +				// so always choose it
> +				debug("N%d.LMC%d.R%d: new best score: rank %d, rodt %d(%3d), new best %d, previous best %d(%d)\n",
> +				      node, if_num, rankx, orankx, rodt_ctl, next_ohms, next_score,
> +				      best_rank_score, best_rank_ohms);
> +				best_rank_score	    = next_score;
> +				best_rank_rtt_nom   = rtt_nom;
> +				//best_rank_nom_ohms  = rtt_nom_ohms;
> +				best_rank_ctl       = rodt_ctl;
> +				best_rank_ohms      = next_ohms;
> +				best_rankx          = orankx;
> +				rl_rank.u64 =
> +					rl_score[rtt_nom][rodt_ctl][orankx].setting;
> +			}
> +		}
> +	}
> +
> +	// this is the end of the BEST ROW SCORE LOOP
> +
> +	// DANGER, Will Robinson!! Abort now if we did not find a best
> +	// score at all...
> +	if (best_rank_score == DEFAULT_BEST_RANK_SCORE) {
> +		printf("N%d.LMC%d.R%d: WARNING: no best rank score found - resetting node...\n",
> +		       node, if_num, rankx);
> +		mdelay(500);
> +		do_reset(NULL, 0, 0, NULL);
> +	}
> +
> +	// FIXME: relative now, but still arbitrary...
> +	max_rank_score = best_rank_score;
> +	if (ddr_type == DDR4_DRAM) {
> +		// halve the range if 2 DIMMs unless they are single rank...
> +		max_rank_score += (MAX_RANK_SCORE_LIMIT / ((num_ranks > 1) ?
> +							   dimm_count : 1));
> +	} else {
> +		// Since DDR3 typically has a wider score range,
> +		// keep more of them always
> +		max_rank_score += MAX_RANK_SCORE_LIMIT;
> +	}
> +
> +	if (!ecc_ena) {
> +		/* ECC is not used */
> +		rl_rank.s.byte8 = rl_rank.s.byte0;
> +	}
> +
> +	// at the end, write the best row settings to the current rank
> +	lmc_wr(priv, CVMX_LMCX_RLEVEL_RANKX(rankx, if_num), rl_rank.u64);
> +	rl_rank.u64 = lmc_rd(priv, CVMX_LMCX_RLEVEL_RANKX(rankx, if_num));
> +
> +	saved_rl_rank.u64 = rl_rank.u64;
> +
> +	// this is the start of the PRINT LOOP
> +	int pass;
> +
> +	// for pass==0, print current rank, pass==1 print other rank(s)
> +	// this is done because we want to show each ranks RODT values
> +	// together, not interlaced
> +#if COUNT_RL_CANDIDATES
> +	// keep separates for ranks - pass=0 target rank, pass=1 other
> +	// rank on DIMM
> +	int mask_skipped[2] = {0, 0};
> +	int score_skipped[2] = {0, 0};
> +	int selected_rows[2] = {0, 0};
> +	int zero_scores[2] = {0, 0};
> +#endif /* COUNT_RL_CANDIDATES */
> +	for (pass = 0; pass < 2; pass++) {
> +		for (orankx = 0; orankx < dimm_count * 4; orankx++) {
> +			// stay on the same DIMM
> +			if (!(dimm_rank_mask & (1 << orankx)))
> +				continue;
> +
> +			if ((pass == 0 && orankx != rankx) ||
> +			    (pass != 0 && orankx == rankx))
> +				continue;
> +
> +			for (rtt_idx = min_rtt_nom_idx;
> +			     rtt_idx <= max_rtt_nom_idx; ++rtt_idx) {
> +				rtt_nom = imp_val->rtt_nom_table[rtt_idx];
> +				if (dyn_rtt_nom_mask == 0) {
> +					print_nom_ohms = -1;
> +				} else {
> +					print_nom_ohms =
> +						imp_val->rtt_nom_ohms[rtt_nom];
> +				}
> +
> +				// cycle through all the RODT values...
> +				for (rodt_ctl = max_rodt_ctl;
> +				     rodt_ctl >= min_rodt_ctl; --rodt_ctl) {
> +					union cvmx_lmcx_rlevel_rankx
> +						temp_rl_rank;
> +					int temp_score =
> +						rl_score[rtt_nom][rodt_ctl][orankx].score;
> +					int skip_row;
> +
> +					temp_rl_rank.u64 =
> +						rl_score[rtt_nom][rodt_ctl][orankx].setting;
> +
> +					// skip RODT rows in mask, or rows
> +					// with too high a score;
> +					// we will not use them for printing
> +					// or evaluating...
> +#if COUNT_RL_CANDIDATES
> +					if ((1 << rodt_ctl) &
> +					    rodt_row_skip_mask) {
> +						skip_row = WITH_RODT_SKIPPING;
> +						++mask_skipped[pass];
> +					} else if (temp_score >
> +						   max_rank_score) {
> +						skip_row = WITH_RODT_SKIPPING;
> +						++score_skipped[pass];
> +					} else {
> +						skip_row = WITH_RODT_BLANK;
> +						++selected_rows[pass];
> +						if (temp_score == 0)
> +							++zero_scores[pass];
> +					}
> +
> +#else /* COUNT_RL_CANDIDATES */
> +					skip_row = (((1 << rodt_ctl) &
> +						     rodt_row_skip_mask) ||
> +						    (temp_score >
> +						     max_rank_score))
> +						? WITH_RODT_SKIPPING :
> +						WITH_RODT_BLANK;
> +#endif
> +					// identify and print the BEST ROW
> +					// when it comes up
> +					if (skip_row == WITH_RODT_BLANK &&
> +					    best_rankx == orankx &&
> +					    best_rank_rtt_nom == rtt_nom &&
> +					    best_rank_ctl == rodt_ctl)
> +						skip_row = WITH_RODT_BESTROW;
> +
> +					if (rl_print) {
> +						display_rl_with_rodt(if_num,
> +								     temp_rl_rank, orankx, temp_score,
> +								     print_nom_ohms,
> +								     imp_val->rodt_ohms[rodt_ctl],
> +								     skip_row);
> +					}
> +				}
> +			}
> +		}
> +	}
> +#if COUNT_RL_CANDIDATES
> +	debug("N%d.LMC%d.R%d: RLROWS: selected %d+%d, zero_scores %d+%d, mask_skipped %d+%d, score_skipped %d+%d\n",
> +	      node, if_num, rankx, selected_rows[0], selected_rows[1],
> +	      zero_scores[0], zero_scores[1], mask_skipped[0], mask_skipped[1],
> +	      score_skipped[0], score_skipped[1]);
> +#endif /* COUNT_RL_CANDIDATES */
> +	// this is the end of the PRINT LOOP
> +
> +	// now evaluate which bytes need adjusting
> +	// collect the new byte values; first init with current best for
> +	// neighbor use
> +	for (i = 0, byte_sh = 0; i < 8 + ecc_ena; i++, byte_sh += 6) {
> +		rank_best_bytes[i] = (int)(rl_rank.u64 >> byte_sh) &
> +			RLEVEL_BYTE_MSK;
> +	}
> +
> +	// this is the start of the BEST BYTE LOOP
> +
> +	for (i = 0, byte_sh = 0; i < 8 + ecc_ena; i++, byte_sh += 6) {
> +		int sum = 0, count = 0;
> +		int count_less = 0, count_same = 0, count_more = 0;
> +#if FAILSAFE_CHECK
> +		int count_byte; // save the value we counted around
> +#endif /* FAILSAFE_CHECK */
> +		// for rank majority use
> +		int rank_less = 0, rank_same = 0, rank_more = 0;
> +		int neighbor;
> +		int neigh_byte;
> +
> +		best_byte = rank_best_bytes[i];
> +		orig_best_byte = rank_best_bytes[i];
> +
> +		// this is the start of the BEST BYTE AVERAGING LOOP
> +
> +		// validate the initial "best" byte by looking at the
> +		// average of the unskipped byte-column entries
> +		// we want to do this before we go further, so we can
> +		// try to start with a better initial value
> +		// this is the so-called "BESTBUY" patch set
> +
> +		for (rtt_idx = min_rtt_nom_idx; rtt_idx <= max_rtt_nom_idx;
> +		     ++rtt_idx) {
> +			rtt_nom = imp_val->rtt_nom_table[rtt_idx];
> +
> +			for (rodt_ctl = max_rodt_ctl; rodt_ctl >= min_rodt_ctl;
> +			     --rodt_ctl) {
> +				union cvmx_lmcx_rlevel_rankx temp_rl_rank;
> +				int temp_score;
> +
> +				// average over all the ranks
> +				for (orankx = 0; orankx < dimm_count * 4;
> +				     orankx++) {
> +					// stay on the same DIMM
> +					if (!(dimm_rank_mask & (1 << orankx)))
> +						continue;
> +
> +					temp_score =
> +						rl_score[rtt_nom][rodt_ctl][orankx].score;
> +					// skip RODT rows in mask, or rows with
> +					// too high a score;
> +					// we will not use them for printing or
> +					// evaluating...
> +
> +					if (!((1 << rodt_ctl) &
> +					      rodt_row_skip_mask) &&
> +					    temp_score <= max_rank_score) {
> +						temp_rl_rank.u64 =
> +							rl_score[rtt_nom][rodt_ctl][orankx].setting;
> +						temp_byte =
> +							(int)(temp_rl_rank.u64 >> byte_sh) &
> +							RLEVEL_BYTE_MSK;
> +						sum += temp_byte;
> +						count++;
> +					}
> +				}
> +			}
> +		}
> +
> +		// this is the end of the BEST BYTE AVERAGING LOOP
> +
> +		// FIXME: validate count and sum??
> +		avg_byte = (int)divide_nint(sum, count);
> +		avg_diff = best_byte - avg_byte;
> +		new_byte = best_byte;
> +		if (avg_diff != 0) {
> +			// bump best up/dn by 1, not necessarily all the
> +			// way to avg
> +			new_byte = best_byte + ((avg_diff > 0) ? -1 : 1);
> +		}
> +
> +		if (rl_print) {
> +			debug("N%d.LMC%d.R%d: START:   Byte %d: best %d is different by %d from average %d, using %d.\n",
> +			      node, if_num, rankx,
> +			      i, best_byte, avg_diff, avg_byte, new_byte);
> +		}
> +		best_byte = new_byte;
> +#if FAILSAFE_CHECK
> +		count_byte = new_byte; // save the value we will count around
> +#endif /* FAILSAFE_CHECK */
> +
> +		// At this point best_byte is either:
> +		// 1. the original byte-column value from the best scoring
> +		//    RODT row, OR
> +		// 2. that value bumped toward the average of all the
> +		//    byte-column values
> +		//
> +		// best_byte will not change from here on...
> +
> +		// this is the start of the BEST BYTE COUNTING LOOP
> +
> +		// NOTE: we do this next loop separately from above, because
> +		// we count relative to "best_byte"
> +		// which may have been modified by the above averaging
> +		// operation...
> +
> +		for (rtt_idx = min_rtt_nom_idx; rtt_idx <= max_rtt_nom_idx;
> +		     ++rtt_idx) {
> +			rtt_nom = imp_val->rtt_nom_table[rtt_idx];
> +
> +			for (rodt_ctl = max_rodt_ctl; rodt_ctl >= min_rodt_ctl;
> +			     --rodt_ctl) {
> +				union cvmx_lmcx_rlevel_rankx temp_rl_rank;
> +				int temp_score;
> +
> +				for (orankx = 0; orankx < dimm_count * 4;
> +				     orankx++) { // count over all the ranks
> +					// stay on the same DIMM
> +					if (!(dimm_rank_mask & (1 << orankx)))
> +						continue;
> +
> +					temp_score =
> +						rl_score[rtt_nom][rodt_ctl][orankx].score;
> +					// skip RODT rows in mask, or rows
> +					// with too high a score;
> +					// we will not use them for printing
> +					// or evaluating...
> +					if (((1 << rodt_ctl) &
> +					     rodt_row_skip_mask) ||
> +					    temp_score > max_rank_score)
> +						continue;
> +
> +					temp_rl_rank.u64 =
> +						rl_score[rtt_nom][rodt_ctl][orankx].setting;
> +					temp_byte = (temp_rl_rank.u64 >>
> +						     byte_sh) & RLEVEL_BYTE_MSK;
> +
> +					if (temp_byte == 0)
> +						;  // do not count it if illegal
> +					else if (temp_byte == best_byte)
> +						count_same++;
> +					else if (temp_byte == best_byte - 1)
> +						count_less++;
> +					else if (temp_byte == best_byte + 1)
> +						count_more++;
> +					// else do not count anything more
> +					// than 1 away from the best
> +
> +					// no rank counting if disabled
> +					if (disable_rank_majority)
> +						continue;
> +
> +					// FIXME? count is relative to
> +					// best_byte; should it be rank-based?
> +					// rank counts only on main rank
> +					if (orankx != rankx)
> +						continue;
> +					else if (temp_byte == best_byte)
> +						rank_same++;
> +					else if (temp_byte == best_byte - 1)
> +						rank_less++;
> +					else if (temp_byte == best_byte + 1)
> +						rank_more++;
> +				}
> +			}
> +		}
> +
> +		if (rl_print) {
> +			debug("N%d.LMC%d.R%d: COUNT:   Byte %d: orig %d now %d, more %d same %d less %d (%d/%d/%d)\n",
> +			      node, if_num, rankx,
> +			      i, orig_best_byte, best_byte,
> +			      count_more, count_same, count_less,
> +			      rank_more, rank_same, rank_less);
> +		}
> +
> +		// this is the end of the BEST BYTE COUNTING LOOP
> +
> +		// choose the new byte value
> +		// we need to check that there is no gap greater than 2
> +		// between adjacent bytes (adjacency depends on DIMM type)
> +		// use the neighbor value to help decide
> +		// initially, the rank_best_bytes[] will contain values from
> +		// the chosen lowest score rank
> +		new_byte = 0;
> +
> +		// neighbor is index-1 unless we are index 0 or index 8 (ECC)
> +		neighbor = (i == 8) ? 3 : ((i == 0) ? 1 : i - 1);
> +		neigh_byte = rank_best_bytes[neighbor];
> +
> +		// can go up or down or stay the same, so look at a numeric
> +		// average to help
> +		new_byte = (int)divide_nint(((count_more * (best_byte + 1)) +
> +					     (count_same * (best_byte + 0)) +
> +					     (count_less * (best_byte - 1))),
> +					    max(1, (count_more + count_same +
> +						    count_less)));
> +
> +		// use neighbor to help choose with average
> +		if (i > 0 && (abs(neigh_byte - new_byte) > 2) &&
> +		    !disable_sequential_delay_check) {
> +			// but not for byte 0
> +			int avg_pick = new_byte;
> +
> +			if ((new_byte - best_byte) != 0) {
> +				// back to best, average did not get better
> +				new_byte = best_byte;
> +			} else {
> +				// avg was the same, still too far, now move
> +				// it towards the neighbor
> +				new_byte += (neigh_byte > new_byte) ? 1 : -1;
> +			}
> +
> +			if (rl_print) {
> +				debug("N%d.LMC%d.R%d: AVERAGE: Byte %d: neighbor %d too different %d from average %d, picking %d.\n",
> +				      node, if_num, rankx,
> +				      i, neighbor, neigh_byte, avg_pick,
> +				      new_byte);
> +			}
> +		} else {
> +			// NOTE:
> +			// For now, we let the neighbor processing above trump
> +			// the new simple majority processing here.
> +			// This is mostly because we have seen no smoking gun
> +			// for a neighbor bad choice (yet?).
> +			// Also note that we will ALWAYS be using byte 0
> +			// majority, because of the if clause above.
> +
> +			// majority is dependent on the counts, which are
> +			// relative to best_byte, so start there
> +			int maj_byte = best_byte;
> +			int rank_maj;
> +			int rank_sum;
> +
> +			if (count_more > count_same &&
> +			    count_more > count_less) {
> +				maj_byte++;
> +			} else if (count_less > count_same &&
> +				   count_less > count_more) {
> +				maj_byte--;
> +			}
> +
> +			if (maj_byte != new_byte) {
> +				// print only when majority choice is
> +				// different from average
> +				if (rl_print) {
> +					debug("N%d.LMC%d.R%d: MAJORTY: Byte %d: picking majority of %d over average %d.\n",
> +					      node, if_num, rankx, i, maj_byte,
> +					      new_byte);
> +				}
> +				new_byte = maj_byte;
> +			} else {
> +				if (rl_print) {
> +					debug("N%d.LMC%d.R%d: AVERAGE: Byte %d: picking average of %d.\n",
> +					      node, if_num, rankx, i, new_byte);
> +				}
> +			}
> +
> +			if (!disable_rank_majority) {
> +				// rank majority is dependent on the rank
> +				// counts, which are relative to best_byte,
> +				// so start there, and adjust according to the
> +				// rank counts majority
> +				rank_maj = best_byte;
> +				if (rank_more > rank_same &&
> +				    rank_more > rank_less) {
> +					rank_maj++;
> +				} else if (rank_less > rank_same &&
> +					   rank_less > rank_more) {
> +					rank_maj--;
> +				}
> +				rank_sum = rank_more + rank_same + rank_less;
> +
> +				// now, let rank majority possibly rule over
> +				// the current new_byte however we got it
> +				if (rank_maj != new_byte) { // only if different
> +					// Here is where we decide whether to
> +					// completely apply RANK_MAJORITY or not
> +					// ignore if less than
> +					if (rank_maj < new_byte) {
> +						if (rl_print) {
> +							debug("N%d.LMC%d.R%d: RANKMAJ: Byte %d: LESS: NOT using %d over %d.\n",
> +							      node, if_num,
> +							      rankx, i,
> +							      rank_maj,
> +							      new_byte);
> +						}
> +					} else {
> +						// For the moment, we do it
> +						// ONLY when running 2-slot
> +						// configs
> +						//  OR when rank_sum is big
> +						// enough
> +						if (dimm_count > 1 ||
> +						    rank_sum > 2) {
> +							// print only when rank
> +							// majority choice is
> +							// selected
> +							if (rl_print) {
> +								debug("N%d.LMC%d.R%d: RANKMAJ: Byte %d: picking %d over %d.\n",
> +								      node,
> +								      if_num,
> +								      rankx,
> +								      i,
> +								      rank_maj,
> +								      new_byte);
> +							}
> +							new_byte = rank_maj;
> +						} else {
> +							// FIXME: print some
> +							// info when we could
> +							// have chosen RANKMAJ
> +							// but did not
> +							if (rl_print) {
> +								debug("N%d.LMC%d.R%d: RANKMAJ: Byte %d: NOT using %d over %d (best=%d,sum=%d).\n",
> +								      node,
> +								      if_num,
> +								      rankx,
> +								      i,
> +								      rank_maj,
> +								      new_byte,
> +								      best_byte,
> +								      rank_sum);
> +							}
> +						}
> +					}
> +				}
> +			} /* if (!disable_rank_majority) */
> +		}
> +#if FAILSAFE_CHECK
> +		// one last check:
> +		// if new_byte is still count_byte, BUT there was no count
> +		// for that value, DO SOMETHING!!!
> +		// FIXME: go back to original best byte from the best row
> +		if (new_byte == count_byte && count_same == 0) {
> +			new_byte = orig_best_byte;
> +			if (rl_print) {
> +				debug("N%d.LMC%d.R%d: FAILSAF: Byte %d: going back to original %d.\n",
> +				      node, if_num, rankx, i, new_byte);
> +			}
> +		}
> +#endif /* FAILSAFE_CHECK */
> +#if PERFECT_BITMASK_COUNTING
> +		// Look at counts for "perfect" bitmasks (PBMs) if we had
> +		// any for this byte-lane.
> +		// Remember, we only counted for DDR4, so zero means none
> +		// or DDR3, and we bypass this...
> +		value_mask = rank_perf[rankx].mask[i];
> +		disable_rlv_bump_this_byte = 0;
> +
> +		if (value_mask != 0 && rl_ctl.cn78xx.offset == 1) {
> +			int i, delay_count, delay_max = 0, del_val = 0;
> +			int num_values = __builtin_popcountll(value_mask);
> +			int sum_counts = 0;
> +			u64 temp_mask = value_mask;
> +
> +			disable_rlv_bump_this_byte = 1;
> +			i = __builtin_ffsll(temp_mask) - 1;
> +			if (rl_print)
> +				debug("N%d.LMC%d.R%d: PERFECT: Byte %d: OFF1: mask 0x%02llx (%d): ",
> +				      node, if_num, rankx, i, value_mask >> i,
> +				      num_values);
> +
> +			while (temp_mask != 0) {
> +				i = __builtin_ffsll(temp_mask) - 1;
> +				delay_count = rank_perf[rankx].count[i][i];
> +				sum_counts += delay_count;
> +				if (rl_print)
> +					debug("%2d(%2d) ", i, delay_count);
> +				if (delay_count >= delay_max) {
> +					delay_max = delay_count;
> +					del_val = i;
> +				}
> +				temp_mask &= ~(1UL << i);
> +			} /* while (temp_mask != 0) */
> +
> +			// if sum_counts is small, just use NEW_BYTE
> +			if (sum_counts < pbm_lowsum_limit) {
> +				if (rl_print)
> +					debug(": LOWSUM (%2d), choose ORIG ",
> +					      sum_counts);
> +				del_val = new_byte;
> +				delay_max = rank_perf[rankx].count[i][del_val];
> +			}
> +
> +			// finish printing here...
> +			if (rl_print) {
> +				debug(": USING %2d (%2d) D%d\n", del_val,
> +				      delay_max, disable_rlv_bump_this_byte);
> +			}
> +
> +			new_byte = del_val; // override with best PBM choice
> +
> +		} else if ((value_mask != 0) && (rl_ctl.cn78xx.offset == 2)) {
> +			//                        if (value_mask != 0) {
> +			int i, delay_count, del_val;
> +			int num_values = __builtin_popcountll(value_mask);
> +			int sum_counts = 0;
> +			u64 temp_mask = value_mask;
> +
> +			i = __builtin_ffsll(temp_mask) - 1;
> +			if (rl_print)
> +				debug("N%d.LMC%d.R%d: PERFECT: Byte %d: mask 0x%02llx (%d): ",
> +				      node, if_num, rankx, i, value_mask >> i,
> +				      num_values);
> +			while (temp_mask != 0) {
> +				i = __builtin_ffsll(temp_mask) - 1;
> +				delay_count = rank_perf[rankx].count[i][i];
> +				sum_counts += delay_count;
> +				if (rl_print)
> +					debug("%2d(%2d) ", i, delay_count);
> +				temp_mask &= ~(1UL << i);
> +			} /* while (temp_mask != 0) */
> +
> +			del_val = __builtin_ffsll(value_mask) - 1;
> +			delay_count =
> +				rank_perf[rankx].count[i][del_val];
> +
> +			// overkill, normally only 1-4 bits
> +			i = (value_mask >> del_val) & 0x1F;
> +
> +			// if sum_counts is small, treat as special and use
> +			// NEW_BYTE
> +			if (sum_counts < pbm_lowsum_limit) {
> +				if (rl_print)
> +					debug(": LOWSUM (%2d), choose ORIG",
> +					      sum_counts);
> +				i = 99; // SPECIAL case...
> +			}
> +
> +			switch (i) {
> +			case 0x01 /* 00001b */:
> +				// allow BUMP
> +				break;
> +
> +			case 0x13 /* 10011b */:
> +			case 0x0B /* 01011b */:
> +			case 0x03 /* 00011b */:
> +				del_val += 1; // take the second
> +				disable_rlv_bump_this_byte = 1; // allow no BUMP
> +				break;
> +
> +			case 0x0D /* 01101b */:
> +			case 0x05 /* 00101b */:
> +				// test count of lowest and all
> +				if (delay_count >= 5 || sum_counts <= 5)
> +					del_val += 1; // take the hole
> +				else
> +					del_val += 2; // take the next set
> +				disable_rlv_bump_this_byte = 1; // allow no BUMP
> +				break;
> +
> +			case 0x0F /* 01111b */:
> +			case 0x17 /* 10111b */:
> +			case 0x07 /* 00111b */:
> +				del_val += 1; // take the second
> +				if (delay_count < 5) { // lowest count is small
> +					int second =
> +						rank_perf[rankx].count[i][del_val];
> +					int third =
> +						rank_perf[rankx].count[i][del_val + 1];
> +					// test if middle is more than 1 OR
> +					// top is more than 1;
> +					// this means if they are BOTH 1,
> +					// then we keep the second...
> +					if (second > 1 || third > 1) {
> +						// if middle is small OR top
> +						// is large
> +						if (second < 5 ||
> +						    third > 1) {
> +							// take the top
> +							del_val += 1;
> +							if (rl_print)
> +								debug(": TOP7 ");
> +						}
> +					}
> +				}
> +				disable_rlv_bump_this_byte = 1; // allow no BUMP
> +				break;
> +
> +			default: // all others...
> +				if (rl_print)
> +					debug(": ABNORMAL, choose ORIG");
> +
> +			case 99: // special
> +				 // FIXME: choose original choice?
> +				del_val = new_byte;
> +				disable_rlv_bump_this_byte = 1; // allow no BUMP
> +				break;
> +			}
> +			delay_count =
> +				rank_perf[rankx].count[i][del_val];
> +
> +			// finish printing here...
> +			if (rl_print)
> +				debug(": USING %2d (%2d) D%d\n", del_val,
> +				      delay_count, disable_rlv_bump_this_byte);
> +			new_byte = del_val; // override with best PBM choice
> +		} else {
> +			if (ddr_type == DDR4_DRAM) { // only report when DDR4
> +				// FIXME: remove or increase VBL for this
> +				// output...
> +				if (rl_print)
> +					debug("N%d.LMC%d.R%d: PERFECT: Byte %d: ZERO PBMs, USING %d\n",
> +					      node, if_num, rankx, i,
> +					      new_byte);
> +				// prevent ODD bump, rely on original
> +				disable_rlv_bump_this_byte = 1;
> +			}
> +		} /* if (value_mask != 0) */
> +#endif /* PERFECT_BITMASK_COUNTING */
> +
> +		// optionally bump the delay value
> +		if (enable_rldelay_bump && !disable_rlv_bump_this_byte) {
> +			if ((new_byte & enable_rldelay_bump) ==
> +			    enable_rldelay_bump) {
> +				int bump_value = new_byte + rldelay_bump_incr;
> +
> +				if (rl_print) {
> +					debug("N%d.LMC%d.R%d: RLVBUMP: Byte %d: CHANGING %d to %d (%s)\n",
> +					      node, if_num, rankx, i,
> +					      new_byte, bump_value,
> +					      (value_mask &
> +					       (1 << bump_value)) ?
> +					      "PBM" : "NOPBM");
> +				}
> +				new_byte = bump_value;
> +			}
> +		}
> +
> +		// last checks for count-related purposes
> +		if (new_byte == best_byte && count_more > 0 &&
> +		    count_less == 0) {
> +			// we really should take best_byte + 1
> +			if (rl_print) {
> +				debug("N%d.LMC%d.R%d: CADJMOR: Byte %d: CHANGING %d to %d\n",
> +				      node, if_num, rankx, i,
> +				      new_byte, best_byte + 1);
> +				new_byte = best_byte + 1;
> +			}
> +		} else if ((new_byte < best_byte) && (count_same > 0)) {
> +			// we really should take best_byte
> +			if (rl_print) {
> +				debug("N%d.LMC%d.R%d: CADJSAM: Byte %d: CHANGING %d to %d\n",
> +				      node, if_num, rankx, i,
> +				      new_byte, best_byte);
> +				new_byte = best_byte;
> +			}
> +		} else if (new_byte > best_byte) {
> +			if ((new_byte == (best_byte + 1)) &&
> +			    count_more == 0 && count_less > 0) {
> +				// we really should take best_byte
> +				if (rl_print) {
> +					debug("N%d.LMC%d.R%d: CADJLE1: Byte %d: CHANGING %d to %d\n",
> +					      node, if_num, rankx, i,
> +					      new_byte, best_byte);
> +					new_byte = best_byte;
> +				}
> +			} else if ((new_byte >= (best_byte + 2)) &&
> +				   ((count_more > 0) || (count_same > 0))) {
> +				if (rl_print) {
> +					debug("N%d.LMC%d.R%d: CADJLE2: Byte %d: CHANGING %d to %d\n",
> +					      node, if_num, rankx, i,
> +					      new_byte, best_byte + 1);
> +					new_byte = best_byte + 1;
> +				}
> +			}
> +		}
> +
> +		if (rl_print) {
> +			debug("N%d.LMC%d.R%d: SUMMARY: Byte %d: orig %d now %d, more %d same %d less %d, using %d\n",
> +			      node, if_num, rankx, i, orig_best_byte,
> +			      best_byte, count_more, count_same, count_less,
> +			      new_byte);
> +		}
> +
> +		// update the byte with the new value (NOTE: orig value in
> +		// the CSR may not be current "best")
> +		upd_rl_rank(&rl_rank, i, new_byte);
> +
> +		// save new best for neighbor use
> +		rank_best_bytes[i] = new_byte;
> +	} /* for (i = 0; i < 8+ecc_ena; i++) */
> +
> +	////////////////// this is the end of the BEST BYTE LOOP
> +
> +	if (saved_rl_rank.u64 != rl_rank.u64) {
> +		lmc_wr(priv, CVMX_LMCX_RLEVEL_RANKX(rankx, if_num),
> +		       rl_rank.u64);
> +		rl_rank.u64 = lmc_rd(priv,
> +				     CVMX_LMCX_RLEVEL_RANKX(rankx, if_num));
> +		debug("Adjusting Read-Leveling per-RANK settings.\n");
> +	} else {
> +		debug("Not Adjusting Read-Leveling per-RANK settings.\n");
> +	}
> +	display_rl_with_final(if_num, rl_rank, rankx);
> +
> +	// FIXME: does this help make the output a little easier to focus?
> +	if (rl_print > 0)
> +		debug("-----------\n");
> +
> +#if RLEXTRAS_PATCH
> +#define RLEVEL_RANKX_EXTRAS_INCR  0
> +	// if there are unused entries to be filled
> +	if ((rank_mask & 0x0f) != 0x0f) {
> +		// copy the current rank
> +		union cvmx_lmcx_rlevel_rankx temp_rl_rank = rl_rank;
> +
> +		if (rankx < 3) {
> +#if RLEVEL_RANKX_EXTRAS_INCR > 0
> +			int byte, delay;
> +
> +			// modify the copy in prep for writing to empty slot(s)
> +			for (byte = 0; byte < 9; byte++) {
> +				delay = get_rl_rank(&temp_rl_rank, byte) +
> +					RLEVEL_RANKX_EXTRAS_INCR;
> +				if (delay > RLEVEL_BYTE_MSK)
> +					delay = RLEVEL_BYTE_MSK;
> +				upd_rl_rank(&temp_rl_rank, byte, delay);
> +			}
> +#endif
> +
> +			// if rank 0, write rank 1 and rank 2 here if empty
> +			if (rankx == 0) {
> +				// check that rank 1 is empty
> +				if (!(rank_mask & (1 << 1))) {
> +					debug("N%d.LMC%d.R%d: writing RLEVEL_RANK unused entry R%d.\n",
> +					      node, if_num, rankx, 1);
> +					lmc_wr(priv,
> +					       CVMX_LMCX_RLEVEL_RANKX(1,
> +								      if_num),
> +					       temp_rl_rank.u64);
> +				}
> +
> +				// check that rank 2 is empty
> +				if (!(rank_mask & (1 << 2))) {
> +					debug("N%d.LMC%d.R%d: writing RLEVEL_RANK unused entry R%d.\n",
> +					      node, if_num, rankx, 2);
> +					lmc_wr(priv,
> +					       CVMX_LMCX_RLEVEL_RANKX(2,
> +								      if_num),
> +					       temp_rl_rank.u64);
> +				}
> +			}
> +
> +			// if ranks 0, 1 or 2, write rank 3 here if empty
> +			// check that rank 3 is empty
> +			if (!(rank_mask & (1 << 3))) {
> +				debug("N%d.LMC%d.R%d: writing RLEVEL_RANK unused entry R%d.\n",
> +				      node, if_num, rankx, 3);
> +				lmc_wr(priv, CVMX_LMCX_RLEVEL_RANKX(3, if_num),
> +				       temp_rl_rank.u64);
> +			}
> +		}
> +	}
> +#endif /* RLEXTRAS_PATCH */
> +}
> +
> +static void lmc_read_leveling(struct ddr_priv *priv)
> +{
> +	struct rl_score rl_score[RTT_NOM_OHMS_COUNT][RODT_OHMS_COUNT][4];
> +	union cvmx_lmcx_control ctl;
> +	union cvmx_lmcx_config cfg;
> +	int rankx;
> +	char *s;
> +	int i;
> +
> +	/*
> +	 * 4.8.10 LMC Read Leveling
> +	 *
> +	 * LMC supports an automatic read-leveling separately per byte-lane
> +	 * using the DDR3 multipurpose register predefined pattern for system
> +	 * calibration defined in the JEDEC DDR3 specifications.
> +	 *
> +	 * All of DDR PLL, LMC CK, and LMC DRESET, and early LMC initializations
> +	 * must be completed prior to starting this LMC read-leveling sequence.
> +	 *
> +	 * Software could simply write the desired read-leveling values into
> +	 * LMC(0)_RLEVEL_RANK(0..3). This section describes a sequence that uses
> +	 * LMC's autoread-leveling capabilities.
> +	 *
> +	 * When LMC does the read-leveling sequence for a rank, it first enables
> +	 * the DDR3 multipurpose register predefined pattern for system
> +	 * calibration on the selected DRAM rank via a DDR3 MR3 write, then
> +	 * executes 64 RD operations at different internal delay settings, then
> +	 * disables the predefined pattern via another DDR3 MR3 write
> +	 * operation. LMC determines the pass or fail of each of the 64 settings
> +	 * independently for each byte lane, then writes appropriate
> +	 * LMC(0)_RLEVEL_RANK(0..3)[BYTE*] values for the rank.
> +	 *
> +	 * After read-leveling for a rank, software can read the 64 pass/fail
> +	 * indications for one byte lane via LMC(0)_RLEVEL_DBG[BITMASK].
> +	 * Software can observe all pass/fail results for all byte lanes in a
> +	 * rank via separate read-leveling sequences on the rank with different
> +	 * LMC(0)_RLEVEL_CTL[BYTE] values.
> +	 *
> +	 * The 64 pass/fail results will typically have failures for the low
> +	 * delays, followed by a run of some passing settings, followed by more
> +	 * failures in the remaining high delays.  LMC sets
> +	 * LMC(0)_RLEVEL_RANK(0..3)[BYTE*] to one of the passing settings.
> +	 * First, LMC selects the longest run of successes in the 64 results.
> +	 * (In the unlikely event that there is more than one longest run, LMC
> +	 * selects the first one.) Then if LMC(0)_RLEVEL_CTL[OFFSET_EN] = 1 and
> +	 * the selected run has more than LMC(0)_RLEVEL_CTL[OFFSET] successes,
> +	 * LMC selects the last passing setting in the run minus
> +	 * LMC(0)_RLEVEL_CTL[OFFSET]. Otherwise LMC selects the middle setting
> +	 * in the run (rounding earlier when necessary). We expect the
> +	 * read-leveling sequence to produce good results with the reset values
> +	 * LMC(0)_RLEVEL_CTL [OFFSET_EN]=1, LMC(0)_RLEVEL_CTL[OFFSET] = 2.
> +	 *
> +	 * The read-leveling sequence has the following steps:
> +	 *
> +	 * 1. Select desired LMC(0)_RLEVEL_CTL[OFFSET_EN,OFFSET,BYTE] settings.
> +	 *    Do the remaining substeps 2-4 separately for each rank i with
> +	 *    attached DRAM.
> +	 *
> +	 * 2. Without changing any other fields in LMC(0)_CONFIG,
> +	 *
> +	 *    o write LMC(0)_SEQ_CTL[SEQ_SEL] to select read-leveling
> +	 *
> +	 *    o write LMC(0)_CONFIG[RANKMASK] = (1 << i)
> +	 *
> +	 *    o write LMC(0)_SEQ_CTL[INIT_START] = 1
> +	 *
> +	 *    This initiates the previously-described read-leveling.
> +	 *
> +	 * 3. Wait until LMC(0)_RLEVEL_RANKi[STATUS] != 2
> +	 *
> +	 *    LMC will have updated LMC(0)_RLEVEL_RANKi[BYTE*] for all byte
> +	 *    lanes at this point.
> +	 *
> +	 *    If ECC DRAM is not present (i.e. when DRAM is not attached to the
> +	 *    DDR_CBS_0_* and DDR_CB<7:0> chip signals, or the DDR_DQS_<4>_* and
> +	 *    DDR_DQ<35:32> chip signals), write LMC(0)_RLEVEL_RANK*[BYTE8] =
> +	 *    LMC(0)_RLEVEL_RANK*[BYTE0]. Write LMC(0)_RLEVEL_RANK*[BYTE4] =
> +	 *    LMC(0)_RLEVEL_RANK*[BYTE0].
> +	 *
> +	 * 4. If desired, consult LMC(0)_RLEVEL_DBG[BITMASK] and compare to
> +	 *    LMC(0)_RLEVEL_RANKi[BYTE*] for the lane selected by
> +	 *    LMC(0)_RLEVEL_CTL[BYTE]. If desired, modify
> +	 *    LMC(0)_RLEVEL_CTL[BYTE] to a new value and repeat so that all
> +	 *    BITMASKs can be observed.
> +	 *
> +	 * 5. Initialize LMC(0)_RLEVEL_RANK* values for all unused ranks.
> +	 *
> +	 *    Let rank i be a rank with attached DRAM.
> +	 *
> +	 *    For all ranks j that do not have attached DRAM, set
> +	 *    LMC(0)_RLEVEL_RANKj = LMC(0)_RLEVEL_RANKi.
> +	 *
> +	 * This read-leveling sequence can help select the proper CN70XX ODT
> +	 * resistance value (LMC(0)_COMP_CTL2[RODT_CTL]). A hardware-generated
> +	 * LMC(0)_RLEVEL_RANKi[BYTEj] value (for a used byte lane j) that is
> +	 * drastically different from a neighboring LMC(0)_RLEVEL_RANKi[BYTEk]
> +	 * (for a used byte lane k) can indicate that the CN70XX ODT value is
> +	 * bad. It is possible to simultaneously optimize both
> +	 * LMC(0)_COMP_CTL2[RODT_CTL] and LMC(0)_RLEVEL_RANKn[BYTE*] values by
> +	 * performing this read-leveling sequence for several
> +	 * LMC(0)_COMP_CTL2[RODT_CTL] values and selecting the one with the
> +	 * best LMC(0)_RLEVEL_RANKn[BYTE*] profile for the ranks.
> +	 */
> +
> +	rl_rodt_err = 0;
> +	rl_dbg_loops = 1;
> +	saved_int_zqcs_dis = 0;
> +	max_adj_rl_del_inc = 0;
> +	rl_print = RLEVEL_PRINTALL_DEFAULT;
> +
> +#ifdef ENABLE_HARDCODED_RLEVEL
> +	part_number[21] = {0};
> +#endif /* ENABLE_HARDCODED_RLEVEL */
> +
> +#if PERFECT_BITMASK_COUNTING
> +	pbm_lowsum_limit = 5; // FIXME: is this a good default?
> +	// FIXME: PBM skip for RODT 240 and 34
> +	pbm_rodt_skip = (1U << ddr4_rodt_ctl_240_ohm) |
> +		(1U << ddr4_rodt_ctl_34_ohm);
> +#endif /* PERFECT_BITMASK_COUNTING */
> +
> +	disable_rank_majority = 0; // control rank majority processing
> +
> +	// default to mask 11b ODDs for DDR4 (except 73xx), else DISABLE
> +	// for DDR3
> +	rldelay_bump_incr = 0;
> +	disable_rlv_bump_this_byte = 0;
> +
> +	enable_rldelay_bump = (ddr_type == DDR4_DRAM) ?
> +		((octeon_is_cpuid(OCTEON_CN73XX)) ? 1 : 3) : 0;
> +
> +	s = lookup_env(priv, "ddr_disable_rank_majority");
> +	if (s)
> +		disable_rank_majority = !!simple_strtoul(s, NULL, 0);
> +
> +#if PERFECT_BITMASK_COUNTING
> +	s = lookup_env(priv, "ddr_pbm_lowsum_limit");
> +	if (s)
> +		pbm_lowsum_limit = simple_strtoul(s, NULL, 0);
> +
> +	s = lookup_env(priv, "ddr_pbm_rodt_skip");
> +	if (s)
> +		pbm_rodt_skip = simple_strtoul(s, NULL, 0);
> +	memset(rank_perf, 0, sizeof(rank_perf));
> +#endif /* PERFECT_BITMASK_COUNTING */
> +
> +	ctl.u64 = lmc_rd(priv, CVMX_LMCX_CONTROL(if_num));
> +	save_ddr2t = ctl.cn78xx.ddr2t;
> +
> +	cfg.u64 = lmc_rd(priv, CVMX_LMCX_CONFIG(if_num));
> +	ecc_ena = cfg.cn78xx.ecc_ena;
> +
> +	s = lookup_env(priv, "ddr_rlevel_2t");
> +	if (s)
> +		ctl.cn78xx.ddr2t = simple_strtoul(s, NULL, 0);
> +
> +	lmc_wr(priv, CVMX_LMCX_CONTROL(if_num), ctl.u64);
> +
> +	debug("LMC%d: Performing Read-Leveling\n", if_num);
> +
> +	rl_ctl.u64 = lmc_rd(priv, CVMX_LMCX_RLEVEL_CTL(if_num));
> +
> +	rl_samples = c_cfg->rlevel_average_loops;
> +	if (rl_samples == 0) {
> +		rl_samples = RLEVEL_SAMPLES_DEFAULT;
> +		// up the samples for these cases
> +		if (dimm_count == 1 || num_ranks == 1)
> +			rl_samples = rl_samples * 2 + 1;
> +	}
> +
> +	rl_compute = c_cfg->rlevel_compute;
> +	rl_ctl.cn78xx.offset_en = c_cfg->offset_en;
> +	rl_ctl.cn78xx.offset    = spd_rdimm
> +		? c_cfg->offset_rdimm
> +		: c_cfg->offset_udimm;
> +
> +	int value = 1; // should ALWAYS be set
> +
> +	s = lookup_env(priv, "ddr_rlevel_delay_unload");
> +	if (s)
> +		value = !!simple_strtoul(s, NULL, 0);
> +	rl_ctl.cn78xx.delay_unload_0 = value;
> +	rl_ctl.cn78xx.delay_unload_1 = value;
> +	rl_ctl.cn78xx.delay_unload_2 = value;
> +	rl_ctl.cn78xx.delay_unload_3 = value;
> +
> +	// use OR_DIS=1 to try for better results
> +	rl_ctl.cn78xx.or_dis = 1;
> +
> +	/*
> +	 * If we will be switching to 32-bit mode, level based on only
> +	 * four bits because there are only 4 ECC bits.
> +	 */
> +	rl_ctl.cn78xx.bitmask = (if_64b) ? 0xFF : 0x0F;
> +
> +	// allow overrides
> +	s = lookup_env(priv, "ddr_rlevel_ctl_or_dis");
> +	if (s)
> +		rl_ctl.cn78xx.or_dis = simple_strtoul(s, NULL, 0);
> +
> +	s = lookup_env(priv, "ddr_rlevel_ctl_bitmask");
> +	if (s)
> +		rl_ctl.cn78xx.bitmask = simple_strtoul(s, NULL, 0);
> +
> +	rl_comp_offs = spd_rdimm
> +		? c_cfg->rlevel_comp_offset_rdimm
> +		: c_cfg->rlevel_comp_offset_udimm;
> +	s = lookup_env(priv, "ddr_rlevel_comp_offset");
> +	if (s)
> +		rl_comp_offs = strtoul(s, NULL, 0);
> +
> +	s = lookup_env(priv, "ddr_rlevel_offset");
> +	if (s)
> +		rl_ctl.cn78xx.offset   = simple_strtoul(s, NULL, 0);
> +
> +	s = lookup_env(priv, "ddr_rlevel_offset_en");
> +	if (s)
> +		rl_ctl.cn78xx.offset_en   = simple_strtoul(s, NULL, 0);
> +
> +	s = lookup_env(priv, "ddr_rlevel_ctl");
> +	if (s)
> +		rl_ctl.u64   = simple_strtoul(s, NULL, 0);
> +
> +	lmc_wr(priv,
> +	       CVMX_LMCX_RLEVEL_CTL(if_num),
> +	       rl_ctl.u64);
> +
> +	// do this here so we can look at final RLEVEL_CTL[offset] setting...
> +	s = lookup_env(priv, "ddr_enable_rldelay_bump");
> +	if (s) {
> +		// also use as mask bits
> +		enable_rldelay_bump = strtoul(s, NULL, 0);
> +	}
> +
> +	if (enable_rldelay_bump != 0)
> +		rldelay_bump_incr = (rl_ctl.cn78xx.offset == 1) ? -1 : 1;
> +
> +	s = lookup_env(priv, "ddr%d_rlevel_debug_loops", if_num);
> +	if (s)
> +		rl_dbg_loops = simple_strtoul(s, NULL, 0);
> +
> +	s = lookup_env(priv, "ddr_rtt_nom_auto");
> +	if (s)
> +		ddr_rtt_nom_auto = !!simple_strtoul(s, NULL, 0);
> +
> +	s = lookup_env(priv, "ddr_rlevel_average");
> +	if (s)
> +		rl_samples = simple_strtoul(s, NULL, 0);
> +
> +	s = lookup_env(priv, "ddr_rlevel_compute");
> +	if (s)
> +		rl_compute = simple_strtoul(s, NULL, 0);
> +
> +	s = lookup_env(priv, "ddr_rlevel_printall");
> +	if (s)
> +		rl_print = simple_strtoul(s, NULL, 0);
> +
> +	debug("RLEVEL_CTL                                    : 0x%016llx\n",
> +	      rl_ctl.u64);
> +	debug("RLEVEL_OFFSET                                 : %6d\n",
> +	      rl_ctl.cn78xx.offset);
> +	debug("RLEVEL_OFFSET_EN                              : %6d\n",
> +	      rl_ctl.cn78xx.offset_en);
> +
> +	/*
> +	 * The purpose for the indexed table is to sort the settings
> +	 * by the ohm value to simplify the testing when incrementing
> +	 * through the settings.  (index => ohms) 1=120, 2=60, 3=40,
> +	 * 4=30, 5=20
> +	 */
> +	min_rtt_nom_idx = (c_cfg->min_rtt_nom_idx == 0) ?
> +		1 : c_cfg->min_rtt_nom_idx;
> +	max_rtt_nom_idx = (c_cfg->max_rtt_nom_idx == 0) ?
> +		5 : c_cfg->max_rtt_nom_idx;
> +
> +	min_rodt_ctl = (c_cfg->min_rodt_ctl == 0) ? 1 : c_cfg->min_rodt_ctl;
> +	max_rodt_ctl = (c_cfg->max_rodt_ctl == 0) ? 5 : c_cfg->max_rodt_ctl;
> +
> +	s = lookup_env(priv, "ddr_min_rodt_ctl");
> +	if (s)
> +		min_rodt_ctl = simple_strtoul(s, NULL, 0);
> +
> +	s = lookup_env(priv, "ddr_max_rodt_ctl");
> +	if (s)
> +		max_rodt_ctl = simple_strtoul(s, NULL, 0);
> +
> +	s = lookup_env(priv, "ddr_min_rtt_nom_idx");
> +	if (s)
> +		min_rtt_nom_idx = simple_strtoul(s, NULL, 0);
> +
> +	s = lookup_env(priv, "ddr_max_rtt_nom_idx");
> +	if (s)
> +		max_rtt_nom_idx = simple_strtoul(s, NULL, 0);
> +
> +#ifdef ENABLE_HARDCODED_RLEVEL
> +	if (c_cfg->rl_tbl) {
> +		/* Check for hard-coded read-leveling settings */
> +		get_dimm_part_number(part_number, &dimm_config_table[0],
> +				     0, ddr_type);
> +		for (rankx = 0; rankx < dimm_count * 4; rankx++) {
> +			if (!(rank_mask & (1 << rankx)))
> +				continue;
> +
> +			rl_rank.u64 = lmc_rd(priv,
> +					     CVMX_LMCX_RLEVEL_RANKX(rankx,
> +								    if_num));
> +
> +			i = 0;
> +			while (c_cfg->rl_tbl[i].part) {
> +				debug("DIMM part number:\"%s\", SPD: \"%s\"\n",
> +				      c_cfg->rl_tbl[i].part, part_number);
> +				if ((strcmp(part_number,
> +					    c_cfg->rl_tbl[i].part) == 0) &&
> +				    (abs(c_cfg->rl_tbl[i].speed -
> +					 2 * ddr_hertz / (1000 * 1000)) < 10)) {
> +					debug("Using hard-coded read leveling for DIMM part number: \"%s\"\n",
> +					      part_number);
> +					rl_rank.u64 =
> +						c_cfg->rl_tbl[i].rl_rank[if_num][rankx];
> +					lmc_wr(priv,
> +					       CVMX_LMCX_RLEVEL_RANKX(rankx,
> +								      if_num),
> +					       rl_rank.u64);
> +					rl_rank.u64 =
> +						lmc_rd(priv,
> +						       CVMX_LMCX_RLEVEL_RANKX(rankx,
> +									      if_num));
> +					display_rl(if_num, rl_rank, rankx);
> +					/* Disable h/w read-leveling */
> +					rl_dbg_loops = 0;
> +					break;
> +				}
> +				++i;
> +			}
> +		}
> +	}
> +#endif /* ENABLE_HARDCODED_RLEVEL */
> +
> +	max_adj_rl_del_inc = c_cfg->maximum_adjacent_rlevel_delay_increment;
> +	s = lookup_env(priv, "ddr_maximum_adjacent_rlevel_delay_increment");
> +	if (s)
> +		max_adj_rl_del_inc = strtoul(s, NULL, 0);
> +
> +	while (rl_dbg_loops--) {
> +		union cvmx_lmcx_modereg_params1 mp1;
> +		union cvmx_lmcx_comp_ctl2 cc2;
> +
> +		/* Initialize the error scoreboard */
> +		memset(rl_score, 0, sizeof(rl_score));
> +
> +		cc2.u64 = lmc_rd(priv, CVMX_LMCX_COMP_CTL2(if_num));
> +		saved_ddr__ptune = cc2.cn78xx.ddr__ptune;
> +		saved_ddr__ntune = cc2.cn78xx.ddr__ntune;
> +
> +		/* Disable dynamic compensation settings */
> +		if (rl_comp_offs != 0) {
> +			cc2.cn78xx.ptune = saved_ddr__ptune;
> +			cc2.cn78xx.ntune = saved_ddr__ntune;
> +
> +			/*
> +			 * Round up the ptune calculation to bias the odd
> +			 * cases toward ptune
> +			 */
> +			cc2.cn78xx.ptune += divide_roundup(rl_comp_offs, 2);
> +			cc2.cn78xx.ntune -= rl_comp_offs / 2;
> +
> +			ctl.u64 = lmc_rd(priv, CVMX_LMCX_CONTROL(if_num));
> +			saved_int_zqcs_dis = ctl.s.int_zqcs_dis;
> +			/* Disable ZQCS while in bypass. */
> +			ctl.s.int_zqcs_dis = 1;
> +			lmc_wr(priv, CVMX_LMCX_CONTROL(if_num), ctl.u64);
> +
> +			cc2.cn78xx.byp = 1; /* Enable bypass mode */
> +			lmc_wr(priv, CVMX_LMCX_COMP_CTL2(if_num), cc2.u64);
> +			lmc_rd(priv, CVMX_LMCX_COMP_CTL2(if_num));
> +			/* Read again */
> +			cc2.u64 = lmc_rd(priv, CVMX_LMCX_COMP_CTL2(if_num));
> +			debug("DDR__PTUNE/DDR__NTUNE                         : %d/%d\n",
> +			      cc2.cn78xx.ddr__ptune, cc2.cn78xx.ddr__ntune);
> +		}
> +
> +		mp1.u64 = lmc_rd(priv, CVMX_LMCX_MODEREG_PARAMS1(if_num));
> +
> +		for (rtt_idx = min_rtt_nom_idx; rtt_idx <= max_rtt_nom_idx;
> +		     ++rtt_idx) {
> +			rtt_nom = imp_val->rtt_nom_table[rtt_idx];
> +
> +			/*
> +			 * When the read ODT mask is zero, dyn_rtt_nom_mask is
> +			 * also zero, so RTT_NOM will not change during
> +			 * read-leveling.  Since the value is fixed, we only
> +			 * need to test it once.
> +			 */
> +			if (dyn_rtt_nom_mask == 0) {
> +				// flag not to print NOM ohms
> +				print_nom_ohms = -1;
> +			} else {
> +				if (dyn_rtt_nom_mask & 1)
> +					mp1.s.rtt_nom_00 = rtt_nom;
> +				if (dyn_rtt_nom_mask & 2)
> +					mp1.s.rtt_nom_01 = rtt_nom;
> +				if (dyn_rtt_nom_mask & 4)
> +					mp1.s.rtt_nom_10 = rtt_nom;
> +				if (dyn_rtt_nom_mask & 8)
> +					mp1.s.rtt_nom_11 = rtt_nom;
> +				// FIXME? rank 0 ohms always?
> +				print_nom_ohms =
> +					imp_val->rtt_nom_ohms[mp1.s.rtt_nom_00];
> +			}
> +
> +			lmc_wr(priv, CVMX_LMCX_MODEREG_PARAMS1(if_num),
> +			       mp1.u64);
> +
> +			if (print_nom_ohms >= 0 && rl_print > 1) {
> +				debug("\n");
> +				debug("RTT_NOM     %3d, %3d, %3d, %3d ohms           :  %x,%x,%x,%x\n",
> +				      imp_val->rtt_nom_ohms[mp1.s.rtt_nom_11],
> +				      imp_val->rtt_nom_ohms[mp1.s.rtt_nom_10],
> +				      imp_val->rtt_nom_ohms[mp1.s.rtt_nom_01],
> +				      imp_val->rtt_nom_ohms[mp1.s.rtt_nom_00],
> +				      mp1.s.rtt_nom_11,
> +				      mp1.s.rtt_nom_10,
> +				      mp1.s.rtt_nom_01,
> +				      mp1.s.rtt_nom_00);
> +			}
> +
> +			ddr_init_seq(priv, rank_mask, if_num);
> +
> +			// Try RANK outside RODT to rearrange the output...
> +			for (rankx = 0; rankx < dimm_count * 4; rankx++) {
> +				if (!(rank_mask & (1 << rankx)))
> +					continue;
> +
> +				for (rodt_ctl = max_rodt_ctl;
> +				     rodt_ctl >= min_rodt_ctl; --rodt_ctl)
> +					rodt_loop(priv, rankx, rl_score);
> +			}
> +		}
> +
> +		/* Re-enable dynamic compensation settings. */
> +		if (rl_comp_offs != 0) {
> +			cc2.u64 = lmc_rd(priv, CVMX_LMCX_COMP_CTL2(if_num));
> +
> +			cc2.cn78xx.ptune = 0;
> +			cc2.cn78xx.ntune = 0;
> +			cc2.cn78xx.byp = 0; /* Disable bypass mode */
> +			lmc_wr(priv, CVMX_LMCX_COMP_CTL2(if_num), cc2.u64);
> +			/* Read once */
> +			lmc_rd(priv, CVMX_LMCX_COMP_CTL2(if_num));
> +
> +			/* Read again */
> +			cc2.u64 = lmc_rd(priv, CVMX_LMCX_COMP_CTL2(if_num));
> +			debug("DDR__PTUNE/DDR__NTUNE                         : %d/%d\n",
> +			      cc2.cn78xx.ddr__ptune, cc2.cn78xx.ddr__ntune);
> +
> +			ctl.u64 = lmc_rd(priv, CVMX_LMCX_CONTROL(if_num));
> +			/* Restore original setting */
> +			ctl.s.int_zqcs_dis = saved_int_zqcs_dis;
> +			lmc_wr(priv, CVMX_LMCX_CONTROL(if_num), ctl.u64);
> +		}
> +
> +		int override_compensation = 0;
> +
> +		s = lookup_env(priv, "ddr__ptune");
> +		if (s)
> +			saved_ddr__ptune = strtoul(s, NULL, 0);
> +
> +		s = lookup_env(priv, "ddr__ntune");
> +		if (s) {
> +			saved_ddr__ntune = strtoul(s, NULL, 0);
> +			override_compensation = 1;
> +		}
> +
> +		if (override_compensation) {
> +			cc2.cn78xx.ptune = saved_ddr__ptune;
> +			cc2.cn78xx.ntune = saved_ddr__ntune;
> +
> +			ctl.u64 = lmc_rd(priv, CVMX_LMCX_CONTROL(if_num));
> +			saved_int_zqcs_dis = ctl.s.int_zqcs_dis;
> +			/* Disable ZQCS while in bypass. */
> +			ctl.s.int_zqcs_dis = 1;
> +			lmc_wr(priv, CVMX_LMCX_CONTROL(if_num), ctl.u64);
> +
> +			cc2.cn78xx.byp = 1; /* Enable bypass mode */
> +			lmc_wr(priv, CVMX_LMCX_COMP_CTL2(if_num), cc2.u64);
> +			/* Read again */
> +			cc2.u64 = lmc_rd(priv, CVMX_LMCX_COMP_CTL2(if_num));
> +
> +			debug("DDR__PTUNE/DDR__NTUNE                         : %d/%d\n",
> +			      cc2.cn78xx.ptune, cc2.cn78xx.ntune);
> +		}
> +
> +		/* Evaluation block */
> +		/* Still at initial value? */
> +		int best_rodt_score = DEFAULT_BEST_RANK_SCORE;
> +		int auto_rodt_ctl = 0;
> +		int auto_rtt_nom  = 0;
> +		int rodt_score;
> +
> +		rodt_row_skip_mask = 0;
> +
> +		// just add specific RODT rows to the skip mask for DDR4
> +		// at this time...
> +		if (ddr_type == DDR4_DRAM) {
> +			// skip RODT row 34 ohms for all DDR4 types
> +			rodt_row_skip_mask |= (1 << ddr4_rodt_ctl_34_ohm);
> +			// skip RODT row 40 ohms for all DDR4 types
> +			rodt_row_skip_mask |= (1 << ddr4_rodt_ctl_40_ohm);
> +#if ADD_48_OHM_SKIP
> +			// skip RODT row 48 ohms for all DDR4 types
> +			rodt_row_skip_mask |= (1 << ddr4_rodt_ctl_48_ohm);
> +#endif /* ADD_48_OHM_SKIP */
> +#if NOSKIP_40_48_OHM
> +			// For now, do not skip RODT row 40 or 48 ohm when
> +			// ddr_hertz is above 1075 MHz
> +			if (ddr_hertz > 1075000000) {
> +				// noskip RODT row 40 ohms
> +				rodt_row_skip_mask &=
> +					~(1 << ddr4_rodt_ctl_40_ohm);
> +				// noskip RODT row 48 ohms
> +				rodt_row_skip_mask &=
> +					~(1 << ddr4_rodt_ctl_48_ohm);
> +			}
> +#endif /* NOSKIP_40_48_OHM */
> +#if NOSKIP_48_STACKED
> +			// For now, do not skip RODT row 48 ohm for 2Rx4
> +			// stacked die DIMMs
> +			if (is_stacked_die && num_ranks == 2 &&
> +			    dram_width == 4) {
> +				// noskip RODT row 48 ohms
> +				rodt_row_skip_mask &=
> +					~(1 << ddr4_rodt_ctl_48_ohm);
> +			}
> +#endif /* NOSKIP_48_STACKED */
> +#if NOSKIP_FOR_MINI
> +			// for now, leave all rows eligible when we have
> +			// mini-DIMMs...
> +			if (spd_dimm_type == 5 || spd_dimm_type == 6)
> +				rodt_row_skip_mask = 0;
> +#endif /* NOSKIP_FOR_MINI */
> +#if NOSKIP_FOR_2S_1R
> +			// for now, leave all rows eligible when we have
> +			// a 2-slot 1-rank config
> +			if (dimm_count == 2 && num_ranks == 1)
> +				rodt_row_skip_mask = 0;
> +#endif /* NOSKIP_FOR_2S_1R */
> +
> +			debug("Evaluating Read-Leveling Scoreboard for AUTO settings.\n");
> +			for (rtt_idx = min_rtt_nom_idx;
> +			     rtt_idx <= max_rtt_nom_idx; ++rtt_idx) {
> +				rtt_nom = imp_val->rtt_nom_table[rtt_idx];
> +
> +				for (rodt_ctl = max_rodt_ctl;
> +				     rodt_ctl >= min_rodt_ctl; --rodt_ctl) {
> +					rodt_score = 0;
> +					for (rankx = 0; rankx < dimm_count * 4;
> +					     rankx++) {
> +						if (!(rank_mask & (1 << rankx)))
> +							continue;
> +
> +						debug("rl_score[rtt_nom=%d][rodt_ctl=%d][rankx=%d].score:%d\n",
> +						      rtt_nom, rodt_ctl, rankx,
> +						      rl_score[rtt_nom][rodt_ctl][rankx].score);
> +						rodt_score +=
> +							rl_score[rtt_nom][rodt_ctl][rankx].score;
> +					}
> +					// FIXME: do we need to skip RODT rows
> +					// here, like we do below in the
> +					// by-RANK settings?
> +
> +					/*
> +					 * When using automatic ODT settings use
> +					 * the ODT settings associated with the
> +					 * best score for all of the tested ODT
> +					 * combinations.
> +					 */
> +
> +					if (rodt_score < best_rodt_score ||
> +					    (rodt_score == best_rodt_score &&
> +					     (imp_val->rodt_ohms[rodt_ctl] >
> +					      imp_val->rodt_ohms[auto_rodt_ctl]))) {
> +						debug("AUTO: new best score for rodt:%d (%d), new score:%d, previous score:%d\n",
> +						      rodt_ctl,
> +						      imp_val->rodt_ohms[rodt_ctl],
> +						      rodt_score,
> +						      best_rodt_score);
> +						best_rodt_score = rodt_score;
> +						auto_rodt_ctl   = rodt_ctl;
> +						auto_rtt_nom    = rtt_nom;
> +					}
> +				}
> +			}
> +
> +			mp1.u64 = lmc_rd(priv,
> +					 CVMX_LMCX_MODEREG_PARAMS1(if_num));
> +
> +			if (ddr_rtt_nom_auto) {
> +				/* Store the automatically set RTT_NOM value */
> +				if (dyn_rtt_nom_mask & 1)
> +					mp1.s.rtt_nom_00 = auto_rtt_nom;
> +				if (dyn_rtt_nom_mask & 2)
> +					mp1.s.rtt_nom_01 = auto_rtt_nom;
> +				if (dyn_rtt_nom_mask & 4)
> +					mp1.s.rtt_nom_10 = auto_rtt_nom;
> +				if (dyn_rtt_nom_mask & 8)
> +					mp1.s.rtt_nom_11 = auto_rtt_nom;
> +			} else {
> +				/*
> +				 * restore the manual settings to the register
> +				 */
> +				mp1.s.rtt_nom_00 = default_rtt_nom[0];
> +				mp1.s.rtt_nom_01 = default_rtt_nom[1];
> +				mp1.s.rtt_nom_10 = default_rtt_nom[2];
> +				mp1.s.rtt_nom_11 = default_rtt_nom[3];
> +			}
> +
> +			lmc_wr(priv, CVMX_LMCX_MODEREG_PARAMS1(if_num),
> +			       mp1.u64);
> +			debug("RTT_NOM     %3d, %3d, %3d, %3d ohms           :  %x,%x,%x,%x\n",
> +			      imp_val->rtt_nom_ohms[mp1.s.rtt_nom_11],
> +			      imp_val->rtt_nom_ohms[mp1.s.rtt_nom_10],
> +			      imp_val->rtt_nom_ohms[mp1.s.rtt_nom_01],
> +			      imp_val->rtt_nom_ohms[mp1.s.rtt_nom_00],
> +			      mp1.s.rtt_nom_11,
> +			      mp1.s.rtt_nom_10,
> +			      mp1.s.rtt_nom_01,
> +			      mp1.s.rtt_nom_00);
> +
> +			debug("RTT_WR      %3d, %3d, %3d, %3d ohms           :  %x,%x,%x,%x\n",
> +			      imp_val->rtt_wr_ohms[extr_wr(mp1.u64, 3)],
> +			      imp_val->rtt_wr_ohms[extr_wr(mp1.u64, 2)],
> +			      imp_val->rtt_wr_ohms[extr_wr(mp1.u64, 1)],
> +			      imp_val->rtt_wr_ohms[extr_wr(mp1.u64, 0)],
> +			      extr_wr(mp1.u64, 3),
> +			      extr_wr(mp1.u64, 2),
> +			      extr_wr(mp1.u64, 1),
> +			      extr_wr(mp1.u64, 0));
> +
> +			debug("DIC         %3d, %3d, %3d, %3d ohms           :  %x,%x,%x,%x\n",
> +			      imp_val->dic_ohms[mp1.s.dic_11],
> +			      imp_val->dic_ohms[mp1.s.dic_10],
> +			      imp_val->dic_ohms[mp1.s.dic_01],
> +			      imp_val->dic_ohms[mp1.s.dic_00],
> +			      mp1.s.dic_11,
> +			      mp1.s.dic_10,
> +			      mp1.s.dic_01,
> +			      mp1.s.dic_00);
> +
> +			if (ddr_type == DDR4_DRAM) {
> +				union cvmx_lmcx_modereg_params2 mp2;
> +				/*
> +				 * We must read the CSR, and not depend on
> +				 * odt_config[odt_idx].odt_mask2, since we could
> +				 * have overridden values with envvars.
> +				 * NOTE: this corrects the printout, since the
> +				 * CSR is not written with the old values...
> +				 */
> +				mp2.u64 = lmc_rd(priv,
> +						 CVMX_LMCX_MODEREG_PARAMS2(if_num));
> +
> +				debug("RTT_PARK    %3d, %3d, %3d, %3d ohms           :  %x,%x,%x,%x\n",
> +				      imp_val->rtt_nom_ohms[mp2.s.rtt_park_11],
> +				      imp_val->rtt_nom_ohms[mp2.s.rtt_park_10],
> +				      imp_val->rtt_nom_ohms[mp2.s.rtt_park_01],
> +				      imp_val->rtt_nom_ohms[mp2.s.rtt_park_00],
> +				      mp2.s.rtt_park_11,
> +				      mp2.s.rtt_park_10,
> +				      mp2.s.rtt_park_01,
> +				      mp2.s.rtt_park_00);
> +
> +				debug("%-45s :  0x%x,0x%x,0x%x,0x%x\n",
> +				      "VREF_RANGE",
> +				      mp2.s.vref_range_11,
> +				      mp2.s.vref_range_10,
> +				      mp2.s.vref_range_01,
> +				      mp2.s.vref_range_00);
> +
> +				debug("%-45s :  0x%x,0x%x,0x%x,0x%x\n",
> +				      "VREF_VALUE",
> +				      mp2.s.vref_value_11,
> +				      mp2.s.vref_value_10,
> +				      mp2.s.vref_value_01,
> +				      mp2.s.vref_value_00);
> +			}
> +
> +			cc2.u64 = lmc_rd(priv, CVMX_LMCX_COMP_CTL2(if_num));
> +			if (ddr_rodt_ctl_auto) {
> +				cc2.cn78xx.rodt_ctl = auto_rodt_ctl;
> +			} else {
> +				// back to the original setting
> +				cc2.cn78xx.rodt_ctl = default_rodt_ctl;
> +			}
> +			lmc_wr(priv, CVMX_LMCX_COMP_CTL2(if_num), cc2.u64);
> +			cc2.u64 = lmc_rd(priv, CVMX_LMCX_COMP_CTL2(if_num));
> +			debug("Read ODT_CTL                                  : 0x%x (%d ohms)\n",
> +			      cc2.cn78xx.rodt_ctl,
> +			      imp_val->rodt_ohms[cc2.cn78xx.rodt_ctl]);
> +
> +			/*
> +			 * Use the delays associated with the best score for
> +			 * each individual rank
> +			 */
> +			debug("Evaluating Read-Leveling Scoreboard for per-RANK settings.\n");
> +
> +			// this is the RANK MAJOR LOOP
> +			for (rankx = 0; rankx < dimm_count * 4; rankx++)
> +				rank_major_loop(priv, rankx, rl_score);
> +		}  /* Evaluation block */
> +	} /* while(rl_dbg_loops--) */
> +
> +	ctl.cn78xx.ddr2t = save_ddr2t;
> +	lmc_wr(priv, CVMX_LMCX_CONTROL(if_num), ctl.u64);
> +	ctl.u64 = lmc_rd(priv, CVMX_LMCX_CONTROL(if_num));
> +	/* Display final 2T value */
> +	debug("DDR2T                                         : %6d\n",
> +	      ctl.cn78xx.ddr2t);
> +
> +	ddr_init_seq(priv, rank_mask, if_num);
> +
> +	for (rankx = 0; rankx < dimm_count * 4; rankx++) {
> +		u64 value;
> +		int parameter_set = 0;
> +
> +		if (!(rank_mask & (1 << rankx)))
> +			continue;
> +
> +		rl_rank.u64 = lmc_rd(priv, CVMX_LMCX_RLEVEL_RANKX(rankx,
> +								  if_num));
> +
> +		for (i = 0; i < 9; ++i) {
> +			s = lookup_env(priv, "ddr%d_rlevel_rank%d_byte%d",
> +				       if_num, rankx, i);
> +			if (s) {
> +				parameter_set |= 1;
> +				value = simple_strtoul(s, NULL, 0);
> +
> +				upd_rl_rank(&rl_rank, i, value);
> +			}
> +		}
> +
> +		s = lookup_env_ull(priv, "ddr%d_rlevel_rank%d", if_num, rankx);
> +		if (s) {
> +			parameter_set |= 1;
> +			value = simple_strtoull(s, NULL, 0);
> +			rl_rank.u64 = value;
> +		}
> +
> +		if (parameter_set) {
> +			lmc_wr(priv,
> +			       CVMX_LMCX_RLEVEL_RANKX(rankx, if_num),
> +			       rl_rank.u64);
> +			rl_rank.u64 = lmc_rd(priv,
> +					     CVMX_LMCX_RLEVEL_RANKX(rankx,
> +								    if_num));
> +			display_rl(if_num, rl_rank, rankx);
> +		}
> +	}
> +}
> +
> +int init_octeon3_ddr3_interface(struct ddr_priv *priv,
> +				struct ddr_conf *_ddr_conf, u32 _ddr_hertz,
> +				u32 cpu_hertz, u32 ddr_ref_hertz, int _if_num,
> +				u32 _if_mask)
> +{
> +	union cvmx_lmcx_control ctrl;
> +	int ret;
> +	char *s;
> +	int i;
> +
> +	if_num = _if_num;
> +	ddr_hertz = _ddr_hertz;
> +	ddr_conf = _ddr_conf;
> +	if_mask = _if_mask;
> +	odt_1rank_config = ddr_conf->odt_1rank_config;
> +	odt_2rank_config = ddr_conf->odt_2rank_config;
> +	odt_4rank_config = ddr_conf->odt_4rank_config;
> +	dimm_config_table = ddr_conf->dimm_config_table;
> +	c_cfg = &ddr_conf->custom_lmc_config;
> +
> +	/*
> +	 * Compute clock rates to the nearest picosecond.
> +	 */
> +	tclk_psecs = hertz_to_psecs(ddr_hertz);	/* Clock in psecs */
> +	eclk_psecs = hertz_to_psecs(cpu_hertz);	/* Clock in psecs */
> +
> +	dimm_count = 0;
> +	/* Accumulate and report all the errors before giving up */
> +	fatal_error = 0;
> +
> +	/* Flag that indicates safe DDR settings should be used */
> +	safe_ddr_flag = 0;
> +	if_64b = 1;		/* Octeon II Default: 64bit interface width */
> +	mem_size_mbytes = 0;
> +	bank_bits = 0;
> +	column_bits_start = 1;
> +	use_ecc = 1;
> +	min_cas_latency = 0, max_cas_latency = 0, override_cas_latency = 0;
> +	spd_package = 0;
> +	spd_rawcard = 0;
> +	spd_rawcard_aorb = 0;
> +	spd_rdimm_registers = 0;
> +	is_stacked_die = 0;
> +	is_3ds_dimm = 0;	// 3DS
> +	lranks_per_prank = 1;	// 3DS: logical ranks per package rank
> +	lranks_bits = 0;	// 3DS: logical ranks bits
> +	die_capacity = 0;	// in Mbits; only used for 3DS
> +
> +	wl_mask_err = 0;
> +	dyn_rtt_nom_mask = 0;
> +	ddr_disable_chip_reset = 1;
> +	match_wl_rtt_nom = 0;
> +
> +	internal_retries = 0;
> +
> +	disable_deskew_training = 0;
> +	restart_if_dsk_incomplete = 0;
> +	last_lane = ((if_64b) ? 8 : 4) + use_ecc;
> +
> +	disable_sequential_delay_check = 0;
> +	wl_print = WLEVEL_PRINTALL_DEFAULT;
> +
> +#if ALLOW_BY_RANK_INIT
> +	enable_by_rank_init = 1;	// FIXME: default by-rank ON
> +	saved_rank_mask = 0;
> +#endif /* ALLOW_BY_RANK_INIT */
> +
> +	node = 0;
> +
> +#if SWL_TRY_HWL_ALT
> +	memset(hwl_alts, 0, sizeof(hwl_alts));
> +#endif
> +
> +	/*
> +	 * Initialize these to shut up the compiler. They are configured
> +	 * and used only for DDR4
> +	 */
> +	ddr4_trrd_lmin = 6000;
> +	ddr4_tccd_lmin = 6000;
> +
> +	debug("\nInitializing node %d DDR interface %d, DDR Clock %d, DDR Reference Clock %d, CPUID 0x%08x\n",
> +	      node, if_num, ddr_hertz, ddr_ref_hertz, read_c0_prid());
> +
> +	if (dimm_config_table[0].spd_addrs[0] == 0 &&
> +	    !dimm_config_table[0].spd_ptrs[0]) {
> +		printf("ERROR: No dimms specified in the dimm_config_table.\n");
> +		return -1;
> +	}
> +
> +	// allow some overrides to be done
> +
> +	// this one controls several things related to DIMM geometry: HWL and RL
> +	disable_sequential_delay_check = c_cfg->disable_sequential_delay_check;
> +	s = lookup_env(priv, "ddr_disable_sequential_delay_check");
> +	if (s)
> +		disable_sequential_delay_check = strtoul(s, NULL, 0);
> +
> +	// this one controls whether chip RESET is done, or LMC init restarted
> +	// from step 6.9.6
> +	s = lookup_env(priv, "ddr_disable_chip_reset");
> +	if (s)
> +		ddr_disable_chip_reset = !!strtoul(s, NULL, 0);
> +
> +	// this one controls whether Deskew Training is performed
> +	s = lookup_env(priv, "ddr_disable_deskew_training");
> +	if (s)
> +		disable_deskew_training = !!strtoul(s, NULL, 0);
> +
> +	if (ddr_verbose(priv)) {
> +		printf("DDR SPD Table:");
> +		for (didx = 0; didx < DDR_CFG_T_MAX_DIMMS; ++didx) {
> +			if (dimm_config_table[didx].spd_addrs[0] == 0)
> +				break;
> +
> +			printf(" --ddr%dspd=0x%02x", if_num,
> +			       dimm_config_table[didx].spd_addrs[0]);
> +			if (dimm_config_table[didx].spd_addrs[1] != 0)
> +				printf(",0x%02x",
> +				       dimm_config_table[didx].spd_addrs[1]);
> +		}
> +		printf("\n");
> +	}
> +
> +	/*
> +	 * Walk the DRAM Socket Configuration Table to see what is installed.
> +	 */
> +	for (didx = 0; didx < DDR_CFG_T_MAX_DIMMS; ++didx) {
> +		/* Check for lower DIMM socket populated */
> +		if (validate_dimm(priv, &dimm_config_table[didx], 0)) {
> +			if (ddr_verbose(priv))
> +				report_dimm(&dimm_config_table[didx], 0,
> +					    dimm_count, if_num);
> +			++dimm_count;
> +		} else {
> +			break;
> +		}		/* Finished when there is no lower DIMM */
> +	}
> +
> +	initialize_ddr_clock(priv, ddr_conf, cpu_hertz, ddr_hertz,
> +			     ddr_ref_hertz, if_num, if_mask);
> +
> +	if (!odt_1rank_config)
> +		odt_1rank_config = disable_odt_config;
> +	if (!odt_2rank_config)
> +		odt_2rank_config = disable_odt_config;
> +	if (!odt_4rank_config)
> +		odt_4rank_config = disable_odt_config;
> +
> +	s = env_get("ddr_safe");
> +	if (s) {
> +		safe_ddr_flag = !!simple_strtoul(s, NULL, 0);
> +		printf("Parameter found in environment. ddr_safe = %d\n",
> +		       safe_ddr_flag);
> +	}
> +
> +	if (dimm_count == 0) {
> +		printf("ERROR: DIMM 0 not detected.\n");
> +		return (-1);
> +	}
> +
> +	if (c_cfg->mode32b)
> +		if_64b = 0;
> +
> +	s = lookup_env(priv, "if_64b");
> +	if (s)
> +		if_64b = !!simple_strtoul(s, NULL, 0);
> +
> +	if (if_64b == 1) {
> +		if (octeon_is_cpuid(OCTEON_CN70XX)) {
> +			printf("64-bit interface width is not supported for this Octeon model\n");
> +			++fatal_error;
> +		}
> +	}
> +
> +	/* ddr_type only indicates DDR4 or DDR3 */
> +	ddr_type = (read_spd(&dimm_config_table[0], 0,
> +			     DDR4_SPD_KEY_BYTE_DEVICE_TYPE) == 0x0C) ? 4 : 3;
> +	debug("DRAM Device Type: DDR%d\n", ddr_type);
> +
> +	if (ddr_type == DDR4_DRAM) {
> +		int spd_module_type;
> +		int asymmetric;
> +		const char *signal_load[4] = { "", "MLS", "3DS", "RSV" };
> +
> +		imp_val = &ddr4_impedence_val;
> +
> +		spd_addr =
> +		    read_spd(&dimm_config_table[0], 0,
> +			     DDR4_SPD_ADDRESSING_ROW_COL_BITS);
> +		spd_org =
> +		    read_spd(&dimm_config_table[0], 0,
> +			     DDR4_SPD_MODULE_ORGANIZATION);
> +		spd_banks =
> +		    0xFF & read_spd(&dimm_config_table[0], 0,
> +				    DDR4_SPD_DENSITY_BANKS);
> +
> +		bank_bits =
> +		    (2 + ((spd_banks >> 4) & 0x3)) + ((spd_banks >> 6) & 0x3);
> +		/* Controller can only address 4 bits. */
> +		bank_bits = min((int)bank_bits, 4);
> +
> +		spd_package =
> +		    0XFF & read_spd(&dimm_config_table[0], 0,
> +				    DDR4_SPD_PACKAGE_TYPE);
> +		if (spd_package & 0x80) {	// non-monolithic device
> +			is_stacked_die = ((spd_package & 0x73) == 0x11);
> +			debug("DDR4: Package Type 0x%02x (%s), %d die\n",
> +			      spd_package, signal_load[(spd_package & 3)],
> +			      ((spd_package >> 4) & 7) + 1);
> +			is_3ds_dimm = ((spd_package & 3) == 2);	// is it 3DS?
> +			if (is_3ds_dimm) {	// is it 3DS?
> +				lranks_per_prank = ((spd_package >> 4) & 7) + 1;
> +				// FIXME: should make sure it is only 2H or 4H
> +				// or 8H?
> +				lranks_bits = lranks_per_prank >> 1;
> +				if (lranks_bits == 4)
> +					lranks_bits = 3;
> +			}
> +		} else if (spd_package != 0) {
> +			// FIXME: print non-zero monolithic device definition
> +			debug("DDR4: Package Type MONOLITHIC: %d die, signal load %d\n",
> +			      ((spd_package >> 4) & 7) + 1, (spd_package & 3));
> +		}
> +
> +		asymmetric = (spd_org >> 6) & 1;
> +		if (asymmetric) {
> +			int spd_secondary_pkg =
> +			    read_spd(&dimm_config_table[0], 0,
> +				     DDR4_SPD_SECONDARY_PACKAGE_TYPE);
> +			debug("DDR4: Module Organization: ASYMMETRICAL: Secondary Package Type 0x%02x\n",
> +			      spd_secondary_pkg);
> +		} else {
> +			u64 bus_width =
> +				8 << (0x07 &
> +				read_spd(&dimm_config_table[0], 0,
> +					 DDR4_SPD_MODULE_MEMORY_BUS_WIDTH));
> +			u64 ddr_width = 4 << ((spd_org >> 0) & 0x7);
> +			u64 module_cap;
> +			int shift = (spd_banks & 0x0F);
> +
> +			die_capacity = (shift < 8) ? (256UL << shift) :
> +				((12UL << (shift & 1)) << 10);
> +			debug("DDR4: Module Organization: SYMMETRICAL: capacity per die %d %cbit\n",
> +			      (die_capacity > 512) ? (die_capacity >> 10) :
> +			      die_capacity, (die_capacity > 512) ? 'G' : 'M');
> +			module_cap = ((u64)die_capacity << 20) / 8UL *
> +				bus_width / ddr_width *
> +				(1UL + ((spd_org >> 3) & 0x7));
> +
> +			// is it 3DS?
> +			if (is_3ds_dimm) {
> +				module_cap *= (u64)(((spd_package >> 4) & 7) +
> +						    1);
> +			}
> +			debug("DDR4: Module Organization: SYMMETRICAL: capacity per module %lld GB\n",
> +			      module_cap >> 30);
> +		}
> +
> +		spd_rawcard =
> +		    0xFF & read_spd(&dimm_config_table[0], 0,
> +				    DDR4_SPD_REFERENCE_RAW_CARD);
> +		debug("DDR4: Reference Raw Card 0x%02x\n", spd_rawcard);
> +
> +		spd_module_type =
> +		    read_spd(&dimm_config_table[0], 0,
> +			     DDR4_SPD_KEY_BYTE_MODULE_TYPE);
> +		if (spd_module_type & 0x80) {	// HYBRID module
> +			debug("DDR4: HYBRID module, type %s\n",
> +			      ((spd_module_type & 0x70) ==
> +			       0x10) ? "NVDIMM" : "UNKNOWN");
> +		}
> +		spd_thermal_sensor =
> +		    read_spd(&dimm_config_table[0], 0,
> +			     DDR4_SPD_MODULE_THERMAL_SENSOR);
> +		spd_dimm_type = spd_module_type & 0x0F;
> +		spd_rdimm = (spd_dimm_type == 1) || (spd_dimm_type == 5) ||
> +			(spd_dimm_type == 8);
> +		if (spd_rdimm) {
> +			u16 spd_mfgr_id, spd_register_rev, spd_mod_attr;
> +			static const u16 manu_ids[4] = {
> +				0xb380, 0x3286, 0x9780, 0xb304
> +			};
> +			static const char *manu_names[4] = {
> +				"XXX", "XXXXXXX", "XX", "XXXXX"
> +			};
> +			int mc;
> +
> +			spd_mfgr_id =
> +			    (0xFFU &
> +			     read_spd(&dimm_config_table[0], 0,
> +				      DDR4_SPD_REGISTER_MANUFACTURER_ID_LSB)) |
> +			    ((0xFFU &
> +			      read_spd(&dimm_config_table[0], 0,
> +				       DDR4_SPD_REGISTER_MANUFACTURER_ID_MSB))
> +			     << 8);
> +			spd_register_rev =
> +			    0xFFU & read_spd(&dimm_config_table[0], 0,
> +					     DDR4_SPD_REGISTER_REVISION_NUMBER);
> +			for (mc = 0; mc < 4; mc++)
> +				if (manu_ids[mc] == spd_mfgr_id)
> +					break;
> +
> +			debug("DDR4: RDIMM Register Manufacturer ID: %s, Revision: 0x%02x\n",
> +			      (mc >= 4) ? "UNKNOWN" : manu_names[mc],
> +			      spd_register_rev);
> +
> +			// RAWCARD A or B must be bit 7=0 and bits 4-0
> +			// either 00000(A) or 00001(B)
> +			spd_rawcard_aorb = ((spd_rawcard & 0x9fUL) <= 1);
> +			// RDIMM Module Attributes
> +			spd_mod_attr =
> +			    0xFFU & read_spd(&dimm_config_table[0], 0,
> +					DDR4_SPD_UDIMM_ADDR_MAPPING_FROM_EDGE);
> +			spd_rdimm_registers = ((1 << (spd_mod_attr & 3)) >> 1);
> +			debug("DDR4: RDIMM Module Attributes (0x%02x): Register Type DDR4RCD%02d, DRAM rows %d, Registers %d\n",
> +			      spd_mod_attr, (spd_mod_attr >> 4) + 1,
> +			      ((1 << ((spd_mod_attr >> 2) & 3)) >> 1),
> +			      spd_rdimm_registers);
> +		}
> +		dimm_type_name = ddr4_dimm_types[spd_dimm_type];
> +	} else {		/* if (ddr_type == DDR4_DRAM) */
> +		const char *signal_load[4] = { "UNK", "MLS", "SLS", "RSV" };
> +
> +		imp_val = &ddr3_impedence_val;
> +
> +		spd_addr =
> +		    read_spd(&dimm_config_table[0], 0,
> +			     DDR3_SPD_ADDRESSING_ROW_COL_BITS);
> +		spd_org =
> +		    read_spd(&dimm_config_table[0], 0,
> +			     DDR3_SPD_MODULE_ORGANIZATION);
> +		spd_banks =
> +		    read_spd(&dimm_config_table[0], 0,
> +			     DDR3_SPD_DENSITY_BANKS) & 0xff;
> +
> +		bank_bits = 3 + ((spd_banks >> 4) & 0x7);
> +		/* Controller can only address 3 bits. */
> +		bank_bits = min((int)bank_bits, 3);
> +		spd_dimm_type =
> +		    0x0f & read_spd(&dimm_config_table[0], 0,
> +				    DDR3_SPD_KEY_BYTE_MODULE_TYPE);
> +		spd_rdimm = (spd_dimm_type == 1) || (spd_dimm_type == 5) ||
> +			(spd_dimm_type == 9);
> +
> +		spd_package =
> +		    0xFF & read_spd(&dimm_config_table[0], 0,
> +				    DDR3_SPD_SDRAM_DEVICE_TYPE);
> +		if (spd_package & 0x80) {	// non-standard device
> +			debug("DDR3: Device Type 0x%02x (%s), %d die\n",
> +			      spd_package, signal_load[(spd_package & 3)],
> +			      ((1 << ((spd_package >> 4) & 7)) >> 1));
> +		} else if (spd_package != 0) {
> +			// FIXME: print non-zero monolithic device definition
> +			debug("DDR3: Device Type MONOLITHIC: %d die, signal load %d\n",
> +			      ((1 << (spd_package >> 4) & 7) >> 1),
> +			      (spd_package & 3));
> +		}
> +
> +		spd_rawcard =
> +		    0xFF & read_spd(&dimm_config_table[0], 0,
> +				    DDR3_SPD_REFERENCE_RAW_CARD);
> +		debug("DDR3: Reference Raw Card 0x%02x\n", spd_rawcard);
> +		spd_thermal_sensor =
> +		    read_spd(&dimm_config_table[0], 0,
> +			     DDR3_SPD_MODULE_THERMAL_SENSOR);
> +
> +		if (spd_rdimm) {
> +			int spd_mfgr_id, spd_register_rev, spd_mod_attr;
> +
> +			spd_mfgr_id =
> +			    (0xFFU &
> +			     read_spd(&dimm_config_table[0], 0,
> +				      DDR3_SPD_REGISTER_MANUFACTURER_ID_LSB)) |
> +			    ((0xFFU &
> +			      read_spd(&dimm_config_table[0], 0,
> +				       DDR3_SPD_REGISTER_MANUFACTURER_ID_MSB))
> +			     << 8);
> +			spd_register_rev =
> +			    0xFFU & read_spd(&dimm_config_table[0], 0,
> +					     DDR3_SPD_REGISTER_REVISION_NUMBER);
> +			debug("DDR3: RDIMM Register Manufacturer ID 0x%x Revision 0x%02x\n",
> +			      spd_mfgr_id, spd_register_rev);
> +			// Module Attributes
> +			spd_mod_attr =
> +			    0xFFU & read_spd(&dimm_config_table[0], 0,
> +					     DDR3_SPD_ADDRESS_MAPPING);
> +			spd_rdimm_registers = ((1 << (spd_mod_attr & 3)) >> 1);
> +			debug("DDR3: RDIMM Module Attributes (0x%02x): DRAM rows %d, Registers %d\n",
> +			      spd_mod_attr,
> +			      ((1 << ((spd_mod_attr >> 2) & 3)) >> 1),
> +			      spd_rdimm_registers);
> +		}
> +		dimm_type_name = ddr3_dimm_types[spd_dimm_type];
> +	}
> +
> +	if (spd_thermal_sensor & 0x80) {
> +		debug("DDR%d: SPD: Thermal Sensor PRESENT\n",
> +		      (ddr_type == DDR4_DRAM) ? 4 : 3);
> +	}
> +
> +	debug("spd_addr        : %#06x\n", spd_addr);
> +	debug("spd_org         : %#06x\n", spd_org);
> +	debug("spd_banks       : %#06x\n", spd_banks);
> +
> +	row_bits = 12 + ((spd_addr >> 3) & 0x7);
> +	col_bits = 9 + ((spd_addr >> 0) & 0x7);
> +
> +	num_ranks = 1 + ((spd_org >> 3) & 0x7);
> +	dram_width = 4 << ((spd_org >> 0) & 0x7);
> +	num_banks = 1 << bank_bits;
> +
> +	s = lookup_env(priv, "ddr_num_ranks");
> +	if (s)
> +		num_ranks = simple_strtoul(s, NULL, 0);
> +
> +#if ALLOW_BY_RANK_INIT
> +	s = lookup_env(priv, "ddr_enable_by_rank_init");
> +	if (s)
> +		enable_by_rank_init = !!simple_strtoul(s, NULL, 0);
> +
> +	// FIXME: for now, we can only handle a DDR4 2rank-1slot config
> +	// FIXME: also, by-rank init does not work correctly if 32-bit mode...
> +	if (enable_by_rank_init && (ddr_type != DDR4_DRAM ||
> +				    dimm_count != 1 || if_64b != 1 ||
> +				    num_ranks != 2))
> +		enable_by_rank_init = 0;
> +
> +	if (enable_by_rank_init) {
> +		struct dimm_odt_config *odt_config;
> +		union cvmx_lmcx_modereg_params1 mp1;
> +		union cvmx_lmcx_modereg_params2 modereg_params2;
> +		int by_rank_rodt, by_rank_wr, by_rank_park;
> +
> +		// Do ODT settings changes which work best for 2R-1S configs
> +		debug("DDR4: 2R-1S special BY-RANK init ODT settings updated\n");
> +
> +		// setup for modifying config table values - 2 ranks and 1 DIMM
> +		odt_config =
> +		    (struct dimm_odt_config *)&ddr_conf->odt_2rank_config[0];
> +
> +		// original was 80, first try was 60
> +		by_rank_rodt = ddr4_rodt_ctl_48_ohm;
> +		s = lookup_env(priv, "ddr_by_rank_rodt");
> +		if (s)
> +			by_rank_rodt = strtoul(s, NULL, 0);
> +
> +		odt_config->qs_dic = /*RODT_CTL */ by_rank_rodt;
> +
> +		// this is for MODEREG_PARAMS1 fields
> +		// fetch the original settings
> +		mp1.u64 = odt_config->modereg_params1.u64;
> +
> +		by_rank_wr = ddr4_rttwr_80ohm;	// originals were 240
> +		s = lookup_env(priv, "ddr_by_rank_wr");
> +		if (s)
> +			by_rank_wr = simple_strtoul(s, NULL, 0);
> +
> +		// change specific settings here...
> +		insrt_wr(&mp1.u64, /*rank */ 00, by_rank_wr);
> +		insrt_wr(&mp1.u64, /*rank */ 01, by_rank_wr);
> +
> +		// save final settings
> +		odt_config->modereg_params1.u64 = mp1.u64;
> +
> +		// this is for MODEREG_PARAMS2 fields
> +		// fetch the original settings
> +		modereg_params2.u64 = odt_config->modereg_params2.u64;
> +
> +		by_rank_park = ddr4_rttpark_none;	// originals were 120
> +		s = lookup_env(priv, "ddr_by_rank_park");
> +		if (s)
> +			by_rank_park = simple_strtoul(s, NULL, 0);
> +
> +		// change specific settings here...
> +		modereg_params2.s.rtt_park_00 = by_rank_park;
> +		modereg_params2.s.rtt_park_01 = by_rank_park;
> +
> +		// save final settings
> +		odt_config->modereg_params2.u64 = modereg_params2.u64;
> +	}
> +#endif /* ALLOW_BY_RANK_INIT */
> +
> +	/*
> +	 * FIX
> +	 * Check that values are within some theoretical limits.
> +	 * col_bits(min) = row_lsb(min) - bank_bits(max) - bus_bits(max) =
> +	 *   14 - 3 - 4 = 7
> +	 * col_bits(max) = row_lsb(max) - bank_bits(min) - bus_bits(min) =
> +	 *   18 - 2 - 3 = 13
> +	 */
> +	if (col_bits > 13 || col_bits < 7) {
> +		printf("Unsupported number of Col Bits: %d\n", col_bits);
> +		++fatal_error;
> +	}
> +
> +	/*
> +	 * FIX
> +	 * Check that values are within some theoretical limits.
> +	 * row_bits(min) = pbank_lsb(min) - row_lsb(max) - rank_bits =
> +	 *   26 - 18 - 1 = 7
> +	 * row_bits(max) = pbank_lsb(max) - row_lsb(min) - rank_bits =
> +	 *   33 - 14 - 1 = 18
> +	 */
> +	if (row_bits > 18 || row_bits < 7) {
> +		printf("Unsupported number of Row Bits: %d\n", row_bits);
> +		++fatal_error;
> +	}
> +
> +	s = lookup_env(priv, "ddr_rdimm_ena");
> +	if (s)
> +		spd_rdimm = !!simple_strtoul(s, NULL, 0);
> +
> +	wl_loops = WLEVEL_LOOPS_DEFAULT;
> +	// accept generic or interface-specific override
> +	s = lookup_env(priv, "ddr_wlevel_loops");
> +	if (!s)
> +		s = lookup_env(priv, "ddr%d_wlevel_loops", if_num);
> +
> +	if (s)
> +		wl_loops = strtoul(s, NULL, 0);
> +
> +	s = lookup_env(priv, "ddr_ranks");
> +	if (s)
> +		num_ranks = simple_strtoul(s, NULL, 0);
> +
> +	bunk_enable = (num_ranks > 1);
> +
> +	if (octeon_is_cpuid(OCTEON_CN7XXX))
> +		column_bits_start = 3;
> +	else
> +		printf("ERROR: Unsupported Octeon model: 0x%x\n",
> +		       read_c0_prid());
> +
> +	row_lsb = column_bits_start + col_bits + bank_bits - (!if_64b);
> +	debug("row_lsb = column_bits_start + col_bits + bank_bits = %d\n",
> +	      row_lsb);
> +
> +	pbank_lsb = row_lsb + row_bits + bunk_enable;
> +	debug("pbank_lsb = row_lsb + row_bits + bunk_enable = %d\n", pbank_lsb);
> +
> +	if (lranks_per_prank > 1) {
> +		pbank_lsb = row_lsb + row_bits + lranks_bits + bunk_enable;
> +		debug("DDR4: 3DS: pbank_lsb = (%d row_lsb) + (%d row_bits) + (%d lranks_bits) + (%d bunk_enable) = %d\n",
> +		      row_lsb, row_bits, lranks_bits, bunk_enable, pbank_lsb);
> +	}
> +
> +	mem_size_mbytes = dimm_count * ((1ull << pbank_lsb) >> 20);
> +	if (num_ranks == 4) {
> +		/*
> +		 * Quad rank dimm capacity is equivalent to two dual-rank
> +		 * dimms.
> +		 */
> +		mem_size_mbytes *= 2;
> +	}
> +
> +	/*
> +	 * Mask with 1 bits set for each active rank, allowing 2 bits
> +	 * per dimm. This makes later calculations simpler, as a variety
> +	 * of CSRs use this layout. This init needs to be updated for dual
> +	 * configs (ie non-identical DIMMs).
> +	 *
> +	 * Bit 0 = dimm0, rank 0
> +	 * Bit 1 = dimm0, rank 1
> +	 * Bit 2 = dimm1, rank 0
> +	 * Bit 3 = dimm1, rank 1
> +	 * ...
> +	 */
> +	rank_mask = 0x1;
> +	if (num_ranks > 1)
> +		rank_mask = 0x3;
> +	if (num_ranks > 2)
> +		rank_mask = 0xf;
> +
> +	for (i = 1; i < dimm_count; i++)
> +		rank_mask |= ((rank_mask & 0x3) << (2 * i));
> +
> +	/*
> +	 * If we are booting from RAM, the DRAM controller is
> +	 * already set up.  Just return the memory size
> +	 */
> +	if (priv->flags & FLAG_RAM_RESIDENT) {
> +		debug("Ram Boot: Skipping LMC config\n");
> +		return mem_size_mbytes;
> +	}
> +
> +	if (ddr_type == DDR4_DRAM) {
> +		spd_ecc =
> +		    !!(read_spd
> +		       (&dimm_config_table[0], 0,
> +			DDR4_SPD_MODULE_MEMORY_BUS_WIDTH) & 8);
> +	} else {
> +		spd_ecc =
> +		    !!(read_spd
> +		       (&dimm_config_table[0], 0,
> +			DDR3_SPD_MEMORY_BUS_WIDTH) & 8);
> +	}
> +
> +	char rank_spec[8];
> +
> +	printable_rank_spec(rank_spec, num_ranks, dram_width, spd_package);
> +	debug("Summary: %d %s%s %s %s, row bits=%d, col bits=%d, bank bits=%d\n",
> +	      dimm_count, dimm_type_name, (dimm_count > 1) ? "s" : "",
> +	      rank_spec,
> +	      (spd_ecc) ? "ECC" : "non-ECC", row_bits, col_bits, bank_bits);
> +
> +	if (ddr_type == DDR4_DRAM) {
> +		spd_cas_latency =
> +		    ((0xff &
> +		      read_spd(&dimm_config_table[0], 0,
> +			       DDR4_SPD_CAS_LATENCIES_BYTE0)) << 0);
> +		spd_cas_latency |=
> +		    ((0xff &
> +		      read_spd(&dimm_config_table[0], 0,
> +			       DDR4_SPD_CAS_LATENCIES_BYTE1)) << 8);
> +		spd_cas_latency |=
> +		    ((0xff &
> +		      read_spd(&dimm_config_table[0], 0,
> +			       DDR4_SPD_CAS_LATENCIES_BYTE2)) << 16);
> +		spd_cas_latency |=
> +		    ((0xff &
> +		      read_spd(&dimm_config_table[0], 0,
> +			       DDR4_SPD_CAS_LATENCIES_BYTE3)) << 24);
> +	} else {
> +		spd_cas_latency =
> +		    0xff & read_spd(&dimm_config_table[0], 0,
> +				    DDR3_SPD_CAS_LATENCIES_LSB);
> +		spd_cas_latency |=
> +		    ((0xff &
> +		      read_spd(&dimm_config_table[0], 0,
> +			       DDR3_SPD_CAS_LATENCIES_MSB)) << 8);
> +	}
> +	debug("spd_cas_latency : %#06x\n", spd_cas_latency);
> +
> +	if (ddr_type == DDR4_DRAM) {
> +		/*
> +		 * No other values for DDR4 MTB and FTB are specified at the
> +		 * current time so don't bother reading them. Can't speculate
> +		 * how new values will be represented.
> +		 */
> +		int spdmtb = 125;
> +		int spdftb = 1;
> +
> +		taamin = spdmtb * read_spd(&dimm_config_table[0], 0,
> +					   DDR4_SPD_MIN_CAS_LATENCY_TAAMIN) +
> +			 spdftb * (signed char)read_spd(&dimm_config_table[0],
> +			 0, DDR4_SPD_MIN_CAS_LATENCY_FINE_TAAMIN);
> +
> +		ddr4_tckavgmin = spdmtb * read_spd(&dimm_config_table[0], 0,
> +			DDR4_SPD_MINIMUM_CYCLE_TIME_TCKAVGMIN) +
> +			spdftb * (signed char)read_spd(&dimm_config_table[0], 0,
> +			DDR4_SPD_MIN_CYCLE_TIME_FINE_TCKAVGMIN);
> +
> +		ddr4_tckavgmax = spdmtb * read_spd(&dimm_config_table[0], 0,
> +			DDR4_SPD_MAXIMUM_CYCLE_TIME_TCKAVGMAX) +
> +			spdftb * (signed char)read_spd(&dimm_config_table[0], 0,
> +			DDR4_SPD_MAX_CYCLE_TIME_FINE_TCKAVGMAX);
> +
> +		ddr4_trdcmin = spdmtb * read_spd(&dimm_config_table[0], 0,
> +			DDR4_SPD_MIN_RAS_CAS_DELAY_TRCDMIN) +
> +			spdftb * (signed char)read_spd(&dimm_config_table[0], 0,
> +			DDR4_SPD_MIN_RAS_TO_CAS_DELAY_FINE_TRCDMIN);
> +
> +		ddr4_trpmin = spdmtb * read_spd(&dimm_config_table[0], 0,
> +			DDR4_SPD_MIN_ROW_PRECHARGE_DELAY_TRPMIN) +
> +			spdftb * (signed char)read_spd(&dimm_config_table[0], 0,
> +			DDR4_SPD_MIN_ROW_PRECHARGE_DELAY_FINE_TRPMIN);
> +
> +		ddr4_trasmin = spdmtb *
> +			(((read_spd
> +			   (&dimm_config_table[0], 0,
> +			    DDR4_SPD_UPPER_NIBBLES_TRAS_TRC) & 0xf) << 8) +
> +			 (read_spd
> +			  (&dimm_config_table[0], 0,
> +			   DDR4_SPD_MIN_ACTIVE_PRECHARGE_LSB_TRASMIN) & 0xff));
> +
> +		ddr4_trcmin = spdmtb *
> +			((((read_spd
> +			    (&dimm_config_table[0], 0,
> +			     DDR4_SPD_UPPER_NIBBLES_TRAS_TRC) >> 4) & 0xf) <<
> +			  8) + (read_spd
> +				(&dimm_config_table[0], 0,
> +				 DDR4_SPD_MIN_ACTIVE_REFRESH_LSB_TRCMIN) &
> +				0xff))
> +			+ spdftb * (signed char)read_spd(&dimm_config_table[0],
> +							 0,
> +			DDR4_SPD_MIN_ACT_TO_ACT_REFRESH_DELAY_FINE_TRCMIN);
> +
> +		ddr4_trfc1min = spdmtb * (((read_spd(&dimm_config_table[0], 0,
> +			DDR4_SPD_MIN_REFRESH_RECOVERY_MSB_TRFC1MIN) & 0xff) <<
> +			8) + (read_spd(&dimm_config_table[0], 0,
> +			DDR4_SPD_MIN_REFRESH_RECOVERY_LSB_TRFC1MIN) & 0xff));
> +
> +		ddr4_trfc2min = spdmtb * (((read_spd(&dimm_config_table[0], 0,
> +			DDR4_SPD_MIN_REFRESH_RECOVERY_MSB_TRFC2MIN) & 0xff) <<
> +			8) + (read_spd(&dimm_config_table[0], 0,
> +			DDR4_SPD_MIN_REFRESH_RECOVERY_LSB_TRFC2MIN) & 0xff));
> +
> +		ddr4_trfc4min = spdmtb * (((read_spd(&dimm_config_table[0], 0,
> +			DDR4_SPD_MIN_REFRESH_RECOVERY_MSB_TRFC4MIN) & 0xff) <<
> +			8) + (read_spd(&dimm_config_table[0], 0,
> +			DDR4_SPD_MIN_REFRESH_RECOVERY_LSB_TRFC4MIN) & 0xff));
> +
> +		ddr4_tfawmin = spdmtb * (((read_spd(&dimm_config_table[0], 0,
> +			DDR4_SPD_MIN_FOUR_ACTIVE_WINDOW_MSN_TFAWMIN) & 0xf) <<
> +			8) + (read_spd(&dimm_config_table[0], 0,
> +			DDR4_SPD_MIN_FOUR_ACTIVE_WINDOW_LSB_TFAWMIN) & 0xff));
> +
> +		ddr4_trrd_smin = spdmtb * read_spd(&dimm_config_table[0], 0,
> +			DDR4_SPD_MIN_ROW_ACTIVE_DELAY_SAME_TRRD_SMIN) +
> +			spdftb * (signed char)read_spd(&dimm_config_table[0], 0,
> +			DDR4_SPD_MIN_ACT_TO_ACT_DELAY_DIFF_FINE_TRRD_SMIN);
> +
> +		ddr4_trrd_lmin = spdmtb * read_spd(&dimm_config_table[0], 0,
> +			DDR4_SPD_MIN_ROW_ACTIVE_DELAY_DIFF_TRRD_LMIN) +
> +			spdftb * (signed char)read_spd(&dimm_config_table[0], 0,
> +			DDR4_SPD_MIN_ACT_TO_ACT_DELAY_SAME_FINE_TRRD_LMIN);
> +
> +		ddr4_tccd_lmin = spdmtb * read_spd(&dimm_config_table[0], 0,
> +			DDR4_SPD_MIN_CAS_TO_CAS_DELAY_TCCD_LMIN) +
> +			spdftb * (signed char)read_spd(&dimm_config_table[0], 0,
> +			DDR4_SPD_MIN_CAS_TO_CAS_DELAY_FINE_TCCD_LMIN);
> +
> +		debug("%-45s : %6d ps\n", "Medium Timebase (MTB)", spdmtb);
> +		debug("%-45s : %6d ps\n", "Fine Timebase   (FTB)", spdftb);
> +
> +		debug("%-45s : %6d ps (%ld MT/s)\n",
> +		      "SDRAM Minimum Cycle Time (tCKAVGmin)", ddr4_tckavgmin,
> +		      pretty_psecs_to_mts(ddr4_tckavgmin));
> +		debug("%-45s : %6d ps\n",
> +		      "SDRAM Maximum Cycle Time (tCKAVGmax)", ddr4_tckavgmax);
> +		debug("%-45s : %6d ps\n", "Minimum CAS Latency Time (taamin)",
> +		      taamin);
> +		debug("%-45s : %6d ps\n",
> +		      "Minimum RAS to CAS Delay Time (tRCDmin)", ddr4_trdcmin);
> +		debug("%-45s : %6d ps\n",
> +		      "Minimum Row Precharge Delay Time (tRPmin)", ddr4_trpmin);
> +		debug("%-45s : %6d ps\n",
> +		      "Minimum Active to Precharge Delay (tRASmin)",
> +		      ddr4_trasmin);
> +		debug("%-45s : %6d ps\n",
> +		      "Minimum Active to Active/Refr. Delay (tRCmin)",
> +		      ddr4_trcmin);
> +		debug("%-45s : %6d ps\n",
> +		      "Minimum Refresh Recovery Delay (tRFC1min)",
> +		      ddr4_trfc1min);
> +		debug("%-45s : %6d ps\n",
> +		      "Minimum Refresh Recovery Delay (tRFC2min)",
> +		      ddr4_trfc2min);
> +		debug("%-45s : %6d ps\n",
> +		      "Minimum Refresh Recovery Delay (tRFC4min)",
> +		      ddr4_trfc4min);
> +		debug("%-45s : %6d ps\n",
> +		      "Minimum Four Activate Window Time (tFAWmin)",
> +		      ddr4_tfawmin);
> +		debug("%-45s : %6d ps\n",
> +		      "Minimum Act. to Act. Delay (tRRD_Smin)", ddr4_trrd_smin);
> +		debug("%-45s : %6d ps\n",
> +		      "Minimum Act. to Act. Delay (tRRD_Lmin)", ddr4_trrd_lmin);
> +		debug("%-45s : %6d ps\n",
> +		      "Minimum CAS to CAS Delay Time (tCCD_Lmin)",
> +		      ddr4_tccd_lmin);
> +
> +#define DDR4_TWR 15000
> +#define DDR4_TWTR_S 2500
> +
> +		tckmin = ddr4_tckavgmin;
> +		twr = DDR4_TWR;
> +		trcd = ddr4_trdcmin;
> +		trrd = ddr4_trrd_smin;
> +		trp = ddr4_trpmin;
> +		tras = ddr4_trasmin;
> +		trc = ddr4_trcmin;
> +		trfc = ddr4_trfc1min;
> +		twtr = DDR4_TWTR_S;
> +		tfaw = ddr4_tfawmin;
> +
> +		if (spd_rdimm) {
> +			spd_addr_mirror = read_spd(&dimm_config_table[0], 0,
> +			DDR4_SPD_RDIMM_ADDR_MAPPING_FROM_REGISTER_TO_DRAM) &
> +			0x1;
> +		} else {
> +			spd_addr_mirror = read_spd(&dimm_config_table[0], 0,
> +				DDR4_SPD_UDIMM_ADDR_MAPPING_FROM_EDGE) & 0x1;
> +		}
> +		debug("spd_addr_mirror : %#06x\n", spd_addr_mirror);
> +	} else {
> +		spd_mtb_dividend =
> +		    0xff & read_spd(&dimm_config_table[0], 0,
> +				    DDR3_SPD_MEDIUM_TIMEBASE_DIVIDEND);
> +		spd_mtb_divisor =
> +		    0xff & read_spd(&dimm_config_table[0], 0,
> +				    DDR3_SPD_MEDIUM_TIMEBASE_DIVISOR);
> +		spd_tck_min =
> +		    0xff & read_spd(&dimm_config_table[0], 0,
> +				    DDR3_SPD_MINIMUM_CYCLE_TIME_TCKMIN);
> +		spd_taa_min =
> +		    0xff & read_spd(&dimm_config_table[0], 0,
> +				    DDR3_SPD_MIN_CAS_LATENCY_TAAMIN);
> +
> +		spd_twr =
> +		    0xff & read_spd(&dimm_config_table[0], 0,
> +				    DDR3_SPD_MIN_WRITE_RECOVERY_TWRMIN);
> +		spd_trcd =
> +		    0xff & read_spd(&dimm_config_table[0], 0,
> +				    DDR3_SPD_MIN_RAS_CAS_DELAY_TRCDMIN);
> +		spd_trrd =
> +		    0xff & read_spd(&dimm_config_table[0], 0,
> +				    DDR3_SPD_MIN_ROW_ACTIVE_DELAY_TRRDMIN);
> +		spd_trp =
> +		    0xff & read_spd(&dimm_config_table[0], 0,
> +				    DDR3_SPD_MIN_ROW_PRECHARGE_DELAY_TRPMIN);
> +		spd_tras =
> +		    0xff & read_spd(&dimm_config_table[0], 0,
> +				    DDR3_SPD_MIN_ACTIVE_PRECHARGE_LSB_TRASMIN);
> +		spd_tras |=
> +		    ((0xff &
> +		      read_spd(&dimm_config_table[0], 0,
> +			       DDR3_SPD_UPPER_NIBBLES_TRAS_TRC) & 0xf) << 8);
> +		spd_trc =
> +		    0xff & read_spd(&dimm_config_table[0], 0,
> +				    DDR3_SPD_MIN_ACTIVE_REFRESH_LSB_TRCMIN);
> +		spd_trc |=
> +		    ((0xff &
> +		      read_spd(&dimm_config_table[0], 0,
> +			       DDR3_SPD_UPPER_NIBBLES_TRAS_TRC) & 0xf0) << 4);
> +		spd_trfc =
> +		    0xff & read_spd(&dimm_config_table[0], 0,
> +				    DDR3_SPD_MIN_REFRESH_RECOVERY_LSB_TRFCMIN);
> +		spd_trfc |=
> +		    ((0xff &
> +		      read_spd(&dimm_config_table[0], 0,
> +			       DDR3_SPD_MIN_REFRESH_RECOVERY_MSB_TRFCMIN)) <<
> +		     8);
> +		spd_twtr =
> +		    0xff & read_spd(&dimm_config_table[0], 0,
> +				DDR3_SPD_MIN_INTERNAL_WRITE_READ_CMD_TWTRMIN);
> +		spd_trtp =
> +		    0xff & read_spd(&dimm_config_table[0], 0,
> +			DDR3_SPD_MIN_INTERNAL_READ_PRECHARGE_CMD_TRTPMIN);
> +		spd_tfaw =
> +		    0xff & read_spd(&dimm_config_table[0], 0,
> +				    DDR3_SPD_MIN_FOUR_ACTIVE_WINDOW_TFAWMIN);
> +		spd_tfaw |=
> +		    ((0xff &
> +		      read_spd(&dimm_config_table[0], 0,
> +			       DDR3_SPD_UPPER_NIBBLE_TFAW) & 0xf) << 8);
> +		spd_addr_mirror =
> +		    0xff & read_spd(&dimm_config_table[0], 0,
> +				    DDR3_SPD_ADDRESS_MAPPING) & 0x1;
> +		/* Only address mirror unbuffered dimms.  */
> +		spd_addr_mirror = spd_addr_mirror && !spd_rdimm;
> +		ftb_dividend =
> +		    read_spd(&dimm_config_table[0], 0,
> +			     DDR3_SPD_FINE_TIMEBASE_DIVIDEND_DIVISOR) >> 4;
> +		ftb_divisor =
> +		    read_spd(&dimm_config_table[0], 0,
> +			     DDR3_SPD_FINE_TIMEBASE_DIVIDEND_DIVISOR) & 0xf;
> +		/* Make sure that it is not 0 */
> +		ftb_divisor = (ftb_divisor == 0) ? 1 : ftb_divisor;
> +
> +		debug("spd_twr         : %#06x\n", spd_twr);
> +		debug("spd_trcd        : %#06x\n", spd_trcd);
> +		debug("spd_trrd        : %#06x\n", spd_trrd);
> +		debug("spd_trp         : %#06x\n", spd_trp);
> +		debug("spd_tras        : %#06x\n", spd_tras);
> +		debug("spd_trc         : %#06x\n", spd_trc);
> +		debug("spd_trfc        : %#06x\n", spd_trfc);
> +		debug("spd_twtr        : %#06x\n", spd_twtr);
> +		debug("spd_trtp        : %#06x\n", spd_trtp);
> +		debug("spd_tfaw        : %#06x\n", spd_tfaw);
> +		debug("spd_addr_mirror : %#06x\n", spd_addr_mirror);
> +
> +		mtb_psec = spd_mtb_dividend * 1000 / spd_mtb_divisor;
> +		taamin = mtb_psec * spd_taa_min;
> +		taamin += ftb_dividend *
> +			(signed char)read_spd(&dimm_config_table[0],
> +				0, DDR3_SPD_MIN_CAS_LATENCY_FINE_TAAMIN) /
> +			ftb_divisor;
> +		tckmin = mtb_psec * spd_tck_min;
> +		tckmin += ftb_dividend *
> +			(signed char)read_spd(&dimm_config_table[0],
> +				0, DDR3_SPD_MINIMUM_CYCLE_TIME_FINE_TCKMIN) /
> +			ftb_divisor;
> +
> +		twr = spd_twr * mtb_psec;
> +		trcd = spd_trcd * mtb_psec;
> +		trrd = spd_trrd * mtb_psec;
> +		trp = spd_trp * mtb_psec;
> +		tras = spd_tras * mtb_psec;
> +		trc = spd_trc * mtb_psec;
> +		trfc = spd_trfc * mtb_psec;
> +		if (octeon_is_cpuid(OCTEON_CN78XX_PASS2_X) && trfc < 260000) {
> +			// default to this - because it works...
> +			int new_trfc = 260000;
> +
> +			s = env_get("ddr_trfc");
> +			if (s) {
> +				new_trfc = simple_strtoul(s, NULL, 0);
> +				printf("Parameter found in environment. ddr_trfc = %d\n",
> +				       new_trfc);
> +				if (new_trfc < 160000 || new_trfc > 260000) {
> +					// back to default if out of range
> +					new_trfc = 260000;
> +				}
> +			}
> +			debug("N%d.LMC%d: Adjusting tRFC from %d to %d, for CN78XX Pass 2.x\n",
> +			      node, if_num, trfc, new_trfc);
> +			trfc = new_trfc;
> +		}
> +
> +		twtr = spd_twtr * mtb_psec;
> +		trtp = spd_trtp * mtb_psec;
> +		tfaw = spd_tfaw * mtb_psec;
> +
> +		debug("Medium Timebase (MTB)                         : %6d ps\n",
> +		      mtb_psec);
> +		debug("Minimum Cycle Time (tckmin)                   : %6d ps (%ld MT/s)\n",
> +		      tckmin, pretty_psecs_to_mts(tckmin));
> +		debug("Minimum CAS Latency Time (taamin)             : %6d ps\n",
> +		      taamin);
> +		debug("Write Recovery Time (tWR)                     : %6d ps\n",
> +		      twr);
> +		debug("Minimum RAS to CAS delay (tRCD)               : %6d ps\n",
> +		      trcd);
> +		debug("Minimum Row Active to Row Active delay (tRRD) : %6d ps\n",
> +		      trrd);
> +		debug("Minimum Row Precharge Delay (tRP)             : %6d ps\n",
> +		      trp);
> +		debug("Minimum Active to Precharge (tRAS)            : %6d ps\n",
> +		      tras);
> +		debug("Minimum Active to Active/Refresh Delay (tRC)  : %6d ps\n",
> +		      trc);
> +		debug("Minimum Refresh Recovery Delay (tRFC)         : %6d ps\n",
> +		      trfc);
> +		debug("Internal write to read command delay (tWTR)   : %6d ps\n",
> +		      twtr);
> +		debug("Min Internal Rd to Precharge Cmd Delay (tRTP) : %6d ps\n",
> +		      trtp);
> +		debug("Minimum Four Activate Window Delay (tFAW)     : %6d ps\n",
> +		      tfaw);
> +	}
> +
> +	/*
> +	 * When the cycle time is within 1 psec of the minimum accept it
> +	 * as a slight rounding error and adjust it to exactly the minimum
> +	 * cycle time. This avoids an unnecessary warning.
> +	 */
> +	if (abs(tclk_psecs - tckmin) < 2)
> +		tclk_psecs = tckmin;
> +
> +	if (tclk_psecs < (u64)tckmin) {
> +		printf("WARNING!!!!: DDR Clock Rate (tCLK: %ld) exceeds DIMM specifications (tckmin: %ld)!!!!\n",
> +		       tclk_psecs, (ulong)tckmin);
> +	}
> +
> +	debug("DDR Clock Rate (tCLK)                         : %6ld ps\n",
> +	      tclk_psecs);
> +	debug("Core Clock Rate (eCLK)                        : %6ld ps\n",
> +	      eclk_psecs);
> +
> +	s = env_get("ddr_use_ecc");
> +	if (s) {
> +		use_ecc = !!simple_strtoul(s, NULL, 0);
> +		printf("Parameter found in environment. ddr_use_ecc = %d\n",
> +		       use_ecc);
> +	}
> +	use_ecc = use_ecc && spd_ecc;
> +
> +	if_bytemask = if_64b ? (use_ecc ? 0x1ff : 0xff)
> +	    : (use_ecc ? 0x01f : 0x0f);
> +
> +	debug("DRAM Interface width: %d bits %s bytemask 0x%03x\n",
> +	      if_64b ? 64 : 32, use_ecc ? "+ECC" : "", if_bytemask);
> +
> +	debug("\n------ Board Custom Configuration Settings ------\n");
> +	debug("%-45s : %d\n", "MIN_RTT_NOM_IDX   ", c_cfg->min_rtt_nom_idx);
> +	debug("%-45s : %d\n", "MAX_RTT_NOM_IDX   ", c_cfg->max_rtt_nom_idx);
> +	debug("%-45s : %d\n", "MIN_RODT_CTL      ", c_cfg->min_rodt_ctl);
> +	debug("%-45s : %d\n", "MAX_RODT_CTL      ", c_cfg->max_rodt_ctl);
> +	debug("%-45s : %d\n", "MIN_CAS_LATENCY   ", c_cfg->min_cas_latency);
> +	debug("%-45s : %d\n", "OFFSET_EN         ", c_cfg->offset_en);
> +	debug("%-45s : %d\n", "OFFSET_UDIMM      ", c_cfg->offset_udimm);
> +	debug("%-45s : %d\n", "OFFSET_RDIMM      ", c_cfg->offset_rdimm);
> +	debug("%-45s : %d\n", "DDR_RTT_NOM_AUTO  ", c_cfg->ddr_rtt_nom_auto);
> +	debug("%-45s : %d\n", "DDR_RODT_CTL_AUTO ", c_cfg->ddr_rodt_ctl_auto);
> +	if (spd_rdimm)
> +		debug("%-45s : %d\n", "RLEVEL_COMP_OFFSET",
> +		      c_cfg->rlevel_comp_offset_rdimm);
> +	else
> +		debug("%-45s : %d\n", "RLEVEL_COMP_OFFSET",
> +		      c_cfg->rlevel_comp_offset_udimm);
> +	debug("%-45s : %d\n", "RLEVEL_COMPUTE    ", c_cfg->rlevel_compute);
> +	debug("%-45s : %d\n", "DDR2T_UDIMM       ", c_cfg->ddr2t_udimm);
> +	debug("%-45s : %d\n", "DDR2T_RDIMM       ", c_cfg->ddr2t_rdimm);
> +	debug("%-45s : %d\n", "FPRCH2            ", c_cfg->fprch2);
> +	debug("%-45s : %d\n", "PTUNE_OFFSET      ", c_cfg->ptune_offset);
> +	debug("%-45s : %d\n", "NTUNE_OFFSET      ", c_cfg->ntune_offset);
> +	debug("-------------------------------------------------\n");
> +
> +	cl = divide_roundup(taamin, tclk_psecs);
> +
> +	debug("Desired CAS Latency                           : %6d\n", cl);
> +
> +	min_cas_latency = c_cfg->min_cas_latency;
> +
> +	s = lookup_env(priv, "ddr_min_cas_latency");
> +	if (s)
> +		min_cas_latency = simple_strtoul(s, NULL, 0);
> +
> +	debug("CAS Latencies supported in DIMM               :");
> +	base_cl = (ddr_type == DDR4_DRAM) ? 7 : 4;
> +	for (i = 0; i < 32; ++i) {
> +		if ((spd_cas_latency >> i) & 1) {
> +			debug(" %d", i + base_cl);
> +			max_cas_latency = i + base_cl;
> +			if (min_cas_latency == 0)
> +				min_cas_latency = i + base_cl;
> +		}
> +	}
> +	debug("\n");
> +
> +	/*
> +	 * Use relaxed timing when running slower than the minimum
> +	 * supported speed.  Adjust timing to match the smallest supported
> +	 * CAS Latency.
> +	 */
> +	if (min_cas_latency > cl) {
> +		ulong adjusted_tclk = taamin / min_cas_latency;
> +
> +		cl = min_cas_latency;
> +		debug("Slow clock speed. Adjusting timing: tClk = %ld, Adjusted tClk = %ld\n",
> +		      tclk_psecs, adjusted_tclk);
> +		tclk_psecs = adjusted_tclk;
> +	}
> +
> +	s = env_get("ddr_cas_latency");
> +	if (s) {
> +		override_cas_latency = simple_strtoul(s, NULL, 0);
> +		printf("Parameter found in environment. ddr_cas_latency = %d\n",
> +		       override_cas_latency);
> +	}
> +
> +	/* Make sure that the selected cas latency is legal */
> +	for (i = (cl - base_cl); i < 32; ++i) {
> +		if ((spd_cas_latency >> i) & 1) {
> +			cl = i + base_cl;
> +			break;
> +		}
> +	}
> +
> +	if (max_cas_latency < cl)
> +		cl = max_cas_latency;
> +
> +	if (override_cas_latency != 0)
> +		cl = override_cas_latency;
> +
> +	debug("CAS Latency                                   : %6d\n", cl);
> +
> +	if ((cl * tckmin) > 20000) {
> +		debug("(CLactual * tckmin) = %d exceeds 20 ns\n",
> +		      (cl * tckmin));
> +	}
> +
> +	if (tclk_psecs < (ulong)tckmin) {
> +		printf("WARNING!!!!!!: DDR3 Clock Rate (tCLK: %ld) exceeds DIMM specifications (tckmin:%ld)!!!!!!!!\n",
> +		       tclk_psecs, (ulong)tckmin);
> +	}
> +
> +	if (num_banks != 4 && num_banks != 8 && num_banks != 16) {
> +		printf("Unsupported number of banks %d. Must be 4 or 8.\n",
> +		       num_banks);
> +		++fatal_error;
> +	}
> +
> +	if (num_ranks != 1 && num_ranks != 2 && num_ranks != 4) {
> +		printf("Unsupported number of ranks: %d\n", num_ranks);
> +		++fatal_error;
> +	}
> +
> +	if (octeon_is_cpuid(OCTEON_CN78XX) ||
> +	    octeon_is_cpuid(OCTEON_CN73XX) ||
> +	    octeon_is_cpuid(OCTEON_CNF75XX)) {
> +		if (dram_width != 8 && dram_width != 16 && dram_width != 4) {
> +			printf("Unsupported SDRAM Width, %d.  Must be 4, 8 or 16.\n",
> +			       dram_width);
> +			++fatal_error;
> +		}
> +	} else if (dram_width != 8 && dram_width != 16) {
> +		printf("Unsupported SDRAM Width, %d.  Must be 8 or 16.\n",
> +		       dram_width);
> +		++fatal_error;
> +	}
> +
> +	/*
> +	 ** Bail out here if things are not copasetic.
> +	 */
> +	if (fatal_error)
> +		return (-1);
> +
> +	/*
> +	 * 4.8.4 LMC RESET Initialization
> +	 *
> +	 * The purpose of this step is to assert/deassert the RESET# pin at the
> +	 * DDR3/DDR4 parts.
> +	 *
> +	 * This LMC RESET step is done for all enabled LMCs.
> +	 */
> +	perform_lmc_reset(priv, node, if_num);
> +
> +	// Make sure scrambling is disabled during init...
> +	ctrl.u64 = lmc_rd(priv, CVMX_LMCX_CONTROL(if_num));
> +	ctrl.s.scramble_ena = 0;
> +	lmc_wr(priv, CVMX_LMCX_CONTROL(if_num), ctrl.u64);
> +
> +	lmc_wr(priv, CVMX_LMCX_SCRAMBLE_CFG0(if_num), 0);
> +	lmc_wr(priv, CVMX_LMCX_SCRAMBLE_CFG1(if_num), 0);
> +	if (!octeon_is_cpuid(OCTEON_CN78XX_PASS1_X))
> +		lmc_wr(priv, CVMX_LMCX_SCRAMBLE_CFG2(if_num), 0);
> +
> +	odt_idx = min(dimm_count - 1, 3);
> +
> +	switch (num_ranks) {
> +	case 1:
> +		odt_config = odt_1rank_config;
> +		break;
> +	case 2:
> +		odt_config = odt_2rank_config;
> +		break;
> +	case 4:
> +		odt_config = odt_4rank_config;
> +		break;
> +	default:
> +		odt_config = disable_odt_config;
> +		printf("Unsupported number of ranks: %d\n", num_ranks);
> +		++fatal_error;
> +	}
> +
> +	/*
> +	 * 4.8.5 Early LMC Initialization
> +	 *
> +	 * All of DDR PLL, LMC CK, and LMC DRESET initializations must be
> +	 * completed prior to starting this LMC initialization sequence.
> +	 *
> +	 * Perform the following five substeps for early LMC initialization:
> +	 *
> +	 * 1. Software must ensure there are no pending DRAM transactions.
> +	 *
> +	 * 2. Write LMC(0)_CONFIG, LMC(0)_CONTROL, LMC(0)_TIMING_PARAMS0,
> +	 *    LMC(0)_TIMING_PARAMS1, LMC(0)_MODEREG_PARAMS0,
> +	 *    LMC(0)_MODEREG_PARAMS1, LMC(0)_DUAL_MEMCFG, LMC(0)_NXM,
> +	 *    LMC(0)_WODT_MASK, LMC(0)_RODT_MASK, LMC(0)_COMP_CTL2,
> +	 *    LMC(0)_PHY_CTL, LMC(0)_DIMM0/1_PARAMS, and LMC(0)_DIMM_CTL with
> +	 *    appropriate values. All sections in this chapter can be used to
> +	 *    derive proper register settings.
> +	 */
> +
> +	/* LMC(0)_CONFIG */
> +	lmc_config(priv);
> +
> +	/* LMC(0)_CONTROL */
> +	lmc_control(priv);
> +
> +	/* LMC(0)_TIMING_PARAMS0 */
> +	lmc_timing_params0(priv);
> +
> +	/* LMC(0)_TIMING_PARAMS1 */
> +	lmc_timing_params1(priv);
> +
> +	/* LMC(0)_TIMING_PARAMS2 */
> +	lmc_timing_params2(priv);
> +
> +	/* LMC(0)_MODEREG_PARAMS0 */
> +	lmc_modereg_params0(priv);
> +
> +	/* LMC(0)_MODEREG_PARAMS1 */
> +	lmc_modereg_params1(priv);
> +
> +	/* LMC(0)_MODEREG_PARAMS2 */
> +	lmc_modereg_params2(priv);
> +
> +	/* LMC(0)_MODEREG_PARAMS3 */
> +	lmc_modereg_params3(priv);
> +
> +	/* LMC(0)_NXM */
> +	lmc_nxm(priv);
> +
> +	/* LMC(0)_WODT_MASK */
> +	lmc_wodt_mask(priv);
> +
> +	/* LMC(0)_RODT_MASK */
> +	lmc_rodt_mask(priv);
> +
> +	/* LMC(0)_COMP_CTL2 */
> +	lmc_comp_ctl2(priv);
> +
> +	/* LMC(0)_PHY_CTL */
> +	lmc_phy_ctl(priv);
> +
> +	/* LMC(0)_EXT_CONFIG */
> +	lmc_ext_config(priv);
> +
> +	/* LMC(0)_EXT_CONFIG2 */
> +	lmc_ext_config2(priv);
> +
> +	/* LMC(0)_DIMM0/1_PARAMS */
> +	lmc_dimm01_params(priv);
> +
> +	ret = lmc_rank_init(priv);
> +	if (ret < 0)
> +		return 0;	/* 0 indicates problem */
> +
> +	lmc_config_2(priv);
> +
> +	lmc_write_leveling(priv);
> +
> +	lmc_read_leveling(priv);
> +
> +	lmc_workaround(priv);
> +
> +	ret = lmc_sw_write_leveling(priv);
> +	if (ret < 0)
> +		return 0;	/* 0 indicates problem */
> +
> +	// this sometimes causes stack overflow crashes..
> +	// display only for DDR4 RDIMMs.
> +	if (ddr_type == DDR4_DRAM && spd_rdimm) {
> +		int i;
> +
> +		for (i = 0; i < 3; i += 2)	// just pages 0 and 2 for now..
> +			display_mpr_page(priv, rank_mask, if_num, i);
> +	}
> +
> +	lmc_dll(priv);
> +
> +	lmc_workaround_2(priv);
> +
> +	lmc_final(priv);
> +
> +	lmc_scrambling(priv);
> +
> +	return mem_size_mbytes;
> +}
> +
> +/////    HW-assist byte DLL offset tuning   //////
> +
> +/*
> + * Report the number of LMC interfaces in use.
> + *
> + * CN70XX always yields 1. On CN73XX/CNF75XX the INTF_EN bit of LMC1's
> + * DLL_CTL2 selects between 2 and 1. On every other chip the INTF_EN bit
> + * of LMC2's DLL_CTL2 selects between 4 and 2 (for CN78XX, LMCs are
> + * always active in pairs, and always LMC0/1, so LMC2 is sampled to see
> + * if 2 and 3 are active).
> + */
> +static int cvmx_dram_get_num_lmc(struct ddr_priv *priv)
> +{
> +	union cvmx_lmcx_dll_ctl2 dll_ctl2;
> +	int sample_lmc = 2;	/* which LMC's DLL_CTL2 to sample */
> +	int fewest = 2;		/* result when the sampled LMC is off */
> +
> +	if (octeon_is_cpuid(OCTEON_CN70XX))
> +		return 1;
> +
> +	if (octeon_is_cpuid(OCTEON_CN73XX) || octeon_is_cpuid(OCTEON_CNF75XX)) {
> +		sample_lmc = 1;
> +		fewest = 1;
> +	}
> +
> +	dll_ctl2.u64 = lmc_rd(priv, CVMX_LMCX_DLL_CTL2(sample_lmc));
> +
> +	/* an enabled sampled interface doubles the count */
> +	return dll_ctl2.cn78xx.intf_en ? 2 * fewest : fewest;
> +}
> +
> +// got to do these here, even though already defined in BDK
> +
> +/*
> + * Return the number of bank address bits used by interface @lmc.
> + *
> + * All DDR3, and DDR4 x16 today, use only 3 bank bits;
> + * DDR4 x4 and x8 always have 4 bank bits.
> + * NOTE: this will change in the future, when DDR4 x16 devices can
> + * come with 16 banks!! FIXME!!
> + *
> + * @priv: driver private data, passed through to lmc_rd()
> + * @lmc:  LMC interface number
> + *
> + * Return: 0 if LMC(@lmc)_DLL_CTL2[DRESET] is set; otherwise 4 when both
> + *         DDR_PLL_CTL[DDR4_MODE] and CONFIG[BG2_ENABLE] are set, else 3.
> + */
> +static int cvmx_dram_get_num_bank_bits(struct ddr_priv *priv, int lmc)
> +{
> +	union cvmx_lmcx_dll_ctl2 lmcx_dll_ctl2;
> +	union cvmx_lmcx_config lmcx_config;
> +	union cvmx_lmcx_ddr_pll_ctl lmcx_ddr_pll_ctl;
> +	int bank_width;
> +
> +	/* can always read this */
> +	lmcx_dll_ctl2.u64 = lmc_rd(priv, CVMX_LMCX_DLL_CTL2(lmc));
> +
> +	if (lmcx_dll_ctl2.cn78xx.dreset)	/* check LMCn */
> +		return 0;
> +
> +	/*
> +	 * BG2_ENABLE is a field of LMC(n)_CONFIG, so that is the CSR that
> +	 * has to be read here (not DLL_CTL2 a second time).
> +	 */
> +	lmcx_config.u64 = lmc_rd(priv, CVMX_LMCX_CONFIG(lmc));
> +	lmcx_ddr_pll_ctl.u64 = lmc_rd(priv, CVMX_LMCX_DDR_PLL_CTL(lmc));
> +
> +	bank_width = ((lmcx_ddr_pll_ctl.s.ddr4_mode != 0) &&
> +		      (lmcx_config.s.bg2_enable)) ? 4 : 3;
> +
> +	return bank_width;
> +}
> +
> +/* Yield the @width-bit field of @v whose least significant bit is @lsb */
> +#define EXTRACT(v, lsb, width) (((v) >> (lsb)) & ((1ull << (width)) - 1))
> +#define ADDRESS_HOLE 0x10000000ULL
> +
> +/*
> + * Split a DRAM address into its components, using the layout programmed
> + * in L2C_CTL and in the selected LMC's CONFIG, EXT_CONFIG and CONTROL
> + * CSRs. Every result is handed back through the pointer arguments
> + * (*node, *lmc, *dimm, *prank, *lrank, *bank, *row, *col).
> + *
> + * Address bits [2:0] (bus byte) are not reported.
> + */
> +static void cvmx_dram_address_extract_info(struct ddr_priv *priv, u64 address,
> +					   int *node, int *lmc, int *dimm,
> +					   int *prank, int *lrank, int *bank,
> +					   int *row, int *col)
> +{
> +	union cvmx_l2c_ctl l2c;
> +	union cvmx_lmcx_config cfg;
> +	union cvmx_lmcx_control ctl;
> +	union cvmx_lmcx_ext_config ext_cfg;
> +	int alias_bit = 18;
> +	int lmc_bits;
> +	int bank_pos, bank_bits;
> +	int dimm_pos, prank_pos, lrank_pos;
> +	int row_pos, col_hi_pos;
> +	int col_upper;
> +	u64 bank_val;
> +
> +	/* 20 on CN7XXX in general, but CN73XX/CNF75XX stay at 18 */
> +	if (octeon_is_cpuid(OCTEON_CN7XXX))
> +		alias_bit = 20;
> +	if (octeon_is_cpuid(OCTEON_CN73XX) || octeon_is_cpuid(OCTEON_CNF75XX))
> +		alias_bit = 18;
> +
> +	/* Address bits [41:40] */
> +	*node = EXTRACT(address, 40, 2);
> +
> +	/* lop off any node bits or above */
> +	address &= (1ULL << 40) - 1;
> +	/* adjust down if at HOLE or above */
> +	if (address >= ADDRESS_HOLE)
> +		address -= ADDRESS_HOLE;
> +
> +	l2c.u64 = l2c_rd(priv, CVMX_L2C_CTL);
> +
> +	/* LMC select width depends on number of LMCs: 4->2, 2->1, 1->0 */
> +	lmc_bits = cvmx_dram_get_num_lmc(priv) >> 1;
> +	bank_pos = 7 + lmc_bits;
> +
> +	/* Unless index aliasing is disabled, the LMC number is a hash */
> +	*lmc = EXTRACT(address, 7, lmc_bits);
> +	if (!l2c.s.disidxalias) {
> +		*lmc ^= EXTRACT(address, alias_bit, lmc_bits);
> +		*lmc ^= EXTRACT(address, 12, lmc_bits);
> +	}
> +
> +	/* The remaining layout comes from the selected LMC's CSRs */
> +	cfg.u64 = lmc_rd(priv, CVMX_LMCX_CONFIG(*lmc));
> +	ext_cfg.u64 = lmc_rd(priv, CVMX_LMCX_EXT_CONFIG(*lmc));
> +	bank_bits = cvmx_dram_get_num_bank_bits(priv, *lmc);
> +
> +	/* Field start positions; each width is the gap to the next field */
> +	dimm_pos = 28 + cfg.s.pbank_lsb + lmc_bits;
> +	prank_pos = dimm_pos - cfg.s.rank_ena;
> +	lrank_pos = prank_pos - ext_cfg.s.dimm0_cid;
> +	row_pos = 14 + cfg.s.row_lsb + lmc_bits;
> +	col_hi_pos = bank_pos + bank_bits;
> +
> +	*dimm = EXTRACT(address, dimm_pos, 40 - dimm_pos);
> +	*prank = EXTRACT(address, prank_pos, dimm_pos - prank_pos);
> +	*lrank = EXTRACT(address, lrank_pos, prank_pos - lrank_pos);
> +	*row = EXTRACT(address, row_pos, lrank_pos - row_pos);
> +
> +	/* bank calculation may be aliased... */
> +	ctl.u64 = lmc_rd(priv, CVMX_LMCX_CONTROL(*lmc));
> +	bank_val = EXTRACT(address, bank_pos, bank_bits);
> +	if (ctl.s.xor_bank)
> +		bank_val ^= EXTRACT(address, 12 + lmc_bits, bank_bits);
> +	*bank = bank_val;
> +
> +	/* Column: address bits [6:3] plus the bits between bank and row */
> +	col_upper = EXTRACT(address, col_hi_pos, row_pos - col_hi_pos);
> +	*col = EXTRACT(address, 3, 4) | (col_upper << 4);
> +}
> +
> +// end of added workarounds
> +
> +/*
> + * Run the hardware RW_TRAINING sequence (sequence 14) over a range of
> + * addresses on one LMC and report which byte lanes miscompared.
> + *
> + * @priv:     driver private data, passed through to the CSR accessors
> + * @if_num:   LMC interface to test
> + * @p:        base address; the interface and node bits, a 512MB offset
> + *            and bit 63 are applied to it here before use
> + * @mode:     DBTRAIN_TEST: for testing using GP patterns, includes ECC
> + *            DBTRAIN_DBI:  for DBI deskew training behavior (uses GP
> + *                          patterns)
> + *            DBTRAIN_LFSR: for testing using LFSR patterns, includes ECC
> + *            NOTE: trust the caller to specify the correct/supported mode
> + * @xor_data: if non-NULL and @mode is not DBTRAIN_DBI, xor_data[0]
> + *            receives the accumulated failing data bits and xor_data[1]
> + *            the accumulated failing ECC bits; left untouched when the
> + *            address fails the node/LMC check
> + *
> + * The GENERAL_PURPOSE[0-2] data pattern must already be set up by the
> + * caller. RLEVEL_CTL[OR_DIS] is cleared for the duration of the test and
> + * restored on every exit path.
> + *
> + * Return: bitmask of failing byte lanes (bits 0-7: data bytes, bit 8:
> + *         ECC byte). 0 means no errors were recorded, which is also the
> + *         result in DBTRAIN_DBI mode (no checking is done) and when the
> + *         address does not decode to the expected node/LMC.
> + */
> +static int test_dram_byte_hw(struct ddr_priv *priv, int if_num, u64 p,
> +			     int mode, u64 *xor_data)
> +{
> +	u64 p1;
> +	u64 k;
> +	int errors = 0;
> +
> +	u64 mpr_data0, mpr_data1;
> +	u64 bad_bits[2] = { 0, 0 };
> +
> +	int node_address, lmc, dimm;
> +	int prank, lrank;
> +	int bank, row, col;
> +	int save_or_dis;
> +	int byte;
> +	int ba_loop, ba_bits;
> +
> +	union cvmx_lmcx_rlevel_ctl rlevel_ctl;
> +	union cvmx_lmcx_dbtrain_ctl dbtrain_ctl;
> +	union cvmx_lmcx_phy_ctl phy_ctl;
> +
> +	int biter_errs;
> +
> +	/*
> +	 * FIXME: K iterations set to 4 for now.
> +	 * FIXME: decrement to increase iterations.
> +	 * FIXME: must be no less than 22 to stay above an LMC hash field.
> +	 */
> +	int kshift = 27;
> +
> +	const char *s;
> +	int node = 0;
> +
> +	/* allow override default setting for kshift */
> +	s = env_get("ddr_tune_set_kshift");
> +	if (s) {
> +		int temp = simple_strtoul(s, NULL, 0);
> +
> +		if (temp < 22 || temp > 28) {
> +			debug("N%d.LMC%d: ILLEGAL override of kshift to %d, using default %d\n",
> +			      node, if_num, temp, kshift);
> +		} else {
> +			debug("N%d.LMC%d: overriding kshift (%d) to %d\n",
> +			      node, if_num, kshift, temp);
> +			kshift = temp;
> +		}
> +	}
> +
> +	/*
> +	 * 1) Make sure that RLEVEL_CTL[OR_DIS] = 0.
> +	 */
> +	rlevel_ctl.u64 = lmc_rd(priv, CVMX_LMCX_RLEVEL_CTL(if_num));
> +	save_or_dis = rlevel_ctl.s.or_dis;
> +	/* or_dis must be disabled for this sequence */
> +	rlevel_ctl.s.or_dis = 0;
> +	lmc_wr(priv, CVMX_LMCX_RLEVEL_CTL(if_num), rlevel_ctl.u64);
> +
> +	/*
> +	 * NOTE: this step done in the calling routine(s)...
> +	 * 3) Setup GENERAL_PURPOSE[0-2] registers with the data pattern
> +	 * of choice.
> +	 * a. GENERAL_PURPOSE0[DATA<63:0>] - sets the initial lower
> +	 * (rising edge) 64 bits of data.
> +	 * b. GENERAL_PURPOSE1[DATA<63:0>] - sets the initial upper
> +	 * (falling edge) 64 bits of data.
> +	 * c. GENERAL_PURPOSE2[DATA<15:0>] - sets the initial lower
> +	 * (rising edge <7:0>) and upper (falling edge <15:8>) ECC data.
> +	 */
> +
> +	/* final address must include LMC and node */
> +	p |= (if_num << 7);	/* Map address into proper interface */
> +	p |= (u64)node << CVMX_NODE_MEM_SHIFT;	/* map to node */
> +
> +	/*
> +	 * Add base offset to both test regions to not clobber u-boot stuff
> +	 * when running from L2 for NAND boot.
> +	 */
> +	p += 0x20000000;	/* offset to 512MB, ie above THE HOLE!!! */
> +	p |= 1ull << 63;	/* needed for OCTEON */
> +
> +	errors = 0;
> +
> +	cvmx_dram_address_extract_info(priv, p, &node_address, &lmc, &dimm,
> +				       &prank, &lrank, &bank, &row, &col);
> +	debug("%s: START at A:0x%012llx, N%d L%d D%d/%d R%d B%1x Row:%05x Col:%05x\n",
> +	      __func__, p, node_address, lmc, dimm, prank, lrank, bank,
> +	      row, col);
> +
> +	/*
> +	 * only check once per call, and ignore if no match...
> +	 * OR_DIS has already been cleared at this point, so leave through
> +	 * the restore path instead of returning directly.
> +	 */
> +	if ((int)node != node_address) {
> +		printf("ERROR: Node address mismatch\n");
> +		goto restore_or_dis;
> +	}
> +	if (lmc != if_num) {
> +		printf("ERROR: LMC address mismatch\n");
> +		goto restore_or_dis;
> +	}
> +
> +	/*
> +	 * 7) Set PHY_CTL[PHY_RESET] = 1 (LMC automatically clears this as
> +	 * it's a one-shot operation). This is to get into the habit of
> +	 * resetting PHY's SILO to the original 0 location.
> +	 */
> +	phy_ctl.u64 = lmc_rd(priv, CVMX_LMCX_PHY_CTL(if_num));
> +	phy_ctl.s.phy_reset = 1;
> +	lmc_wr(priv, CVMX_LMCX_PHY_CTL(if_num), phy_ctl.u64);
> +
> +	/*
> +	 * Walk through a range of addresses avoiding bits that alias
> +	 * interfaces on the CN88XX.
> +	 */
> +
> +	/*
> +	 * FIXME: want to try to keep the K increment from affecting the
> +	 * LMC via hash, so keep it above bit 21 we also want to keep k
> +	 * less than the base offset of bit 29 (512MB)
> +	 */
> +
> +	for (k = 0; k < (1UL << 29); k += (1UL << kshift)) {
> +		/*
> +		 * FIXME: the sequence will iterate over 1/2 cacheline
> +		 * FIXME: for each unit specified in "read_cmd_count",
> +		 * FIXME: so, we setup each sequence to do the max cachelines
> +		 * it can
> +		 */
> +
> +		p1 = p + k;
> +
> +		cvmx_dram_address_extract_info(priv, p1, &node_address, &lmc,
> +					       &dimm, &prank, &lrank, &bank,
> +					       &row, &col);
> +
> +		/*
> +		 * 2) Setup the fields of the CSR DBTRAIN_CTL as follows:
> +		 * a. COL, ROW, BA, BG, PRANK points to the starting point
> +		 * of the address.
> +		 * You can just set them to all 0.
> +		 * b. RW_TRAIN - set this to 1.
> +		 * c. TCCD_L - set this to 0.
> +		 * d. READ_CMD_COUNT - tells the sequence how many
> +		 * writes/reads to perform.
> +		 * It is a 5-bit field, so set it to 31 for the maximum
> +		 * number of r/w.
> +		 */
> +		dbtrain_ctl.u64 = lmc_rd(priv, CVMX_LMCX_DBTRAIN_CTL(if_num));
> +		dbtrain_ctl.s.column_a = col;
> +		dbtrain_ctl.s.row_a = row;
> +		dbtrain_ctl.s.bg = (bank >> 2) & 3;
> +		dbtrain_ctl.s.prank = (dimm * 2) + prank;	/* FIXME? */
> +		dbtrain_ctl.s.lrank = lrank;	/* FIXME? */
> +		dbtrain_ctl.s.activate = (mode == DBTRAIN_DBI);
> +		dbtrain_ctl.s.write_ena = 1;
> +		dbtrain_ctl.s.read_cmd_count = 31;	/* max count pass 1.x */
> +		if (octeon_is_cpuid(OCTEON_CN78XX_PASS2_X) ||
> +		    octeon_is_cpuid(OCTEON_CNF75XX)) {
> +			/* max count on chips that support it */
> +			dbtrain_ctl.s.cmd_count_ext = 3;
> +		} else {
> +			/* max count pass 1.x */
> +			dbtrain_ctl.s.cmd_count_ext = 0;
> +		}
> +
> +		dbtrain_ctl.s.rw_train = 1;
> +		dbtrain_ctl.s.tccd_sel = (mode == DBTRAIN_DBI);
> +		/* LFSR should only be on when chip supports it... */
> +		dbtrain_ctl.s.lfsr_pattern_sel = (mode == DBTRAIN_LFSR) ? 1 : 0;
> +
> +		biter_errs = 0;
> +
> +		/* for each address, iterate over the 4 "banks" in the BA */
> +		for (ba_loop = 0, ba_bits = bank & 3;
> +		     ba_loop < 4; ba_loop++, ba_bits = (ba_bits + 1) & 3) {
> +			dbtrain_ctl.s.ba = ba_bits;
> +			lmc_wr(priv, CVMX_LMCX_DBTRAIN_CTL(if_num),
> +			       dbtrain_ctl.u64);
> +
> +			/*
> +			 * We will use the RW_TRAINING sequence (14) for
> +			 * this task.
> +			 *
> +			 * 4) Kick off the sequence (SEQ_CTL[SEQ_SEL] = 14,
> +			 *    SEQ_CTL[INIT_START] = 1).
> +			 * 5) Poll on SEQ_CTL[SEQ_COMPLETE] for completion.
> +			 */
> +			oct3_ddr3_seq(priv, prank, if_num, 14);
> +
> +			/*
> +			 * 6) Read MPR_DATA0 and MPR_DATA1 for results.
> +			 * a. MPR_DATA0[MPR_DATA<63:0>] - comparison results
> +			 *    for DQ63:DQ0. (1 means MATCH, 0 means FAIL).
> +			 * b. MPR_DATA1[MPR_DATA<7:0>] - comparison results
> +			 *    for ECC bit7:0.
> +			 */
> +			mpr_data0 = lmc_rd(priv, CVMX_LMCX_MPR_DATA0(if_num));
> +			mpr_data1 = lmc_rd(priv, CVMX_LMCX_MPR_DATA1(if_num));
> +
> +			/*
> +			 * 7) Set PHY_CTL[PHY_RESET] = 1 (LMC automatically
> +			 * clears this as it's a one-shot operation).
> +			 * This is to get into the habit of resetting PHY's
> +			 * SILO to the original 0 location.
> +			 */
> +			phy_ctl.u64 = lmc_rd(priv, CVMX_LMCX_PHY_CTL(if_num));
> +			phy_ctl.s.phy_reset = 1;
> +			lmc_wr(priv, CVMX_LMCX_PHY_CTL(if_num), phy_ctl.u64);
> +
> +			/* bypass any error checking or updating when DBI mode */
> +			if (mode == DBTRAIN_DBI)
> +				continue;
> +
> +			/* data bytes: a 0 bit in MPR_DATA0 is a miscompare */
> +			if (~mpr_data0) {
> +				for (byte = 0; byte < 8; byte++) {
> +					if ((~mpr_data0 >> (8 * byte)) & 0xffUL)
> +						biter_errs |= (1 << byte);
> +				}
> +				/* accumulate bad bits */
> +				bad_bits[0] |= ~mpr_data0;
> +			}
> +
> +			/* include ECC byte errors */
> +			if (~mpr_data1 & 0xffUL) {
> +				biter_errs |= (1 << 8);
> +				bad_bits[1] |= ~mpr_data1 & 0xffUL;
> +			}
> +		}
> +
> +		errors |= biter_errs;
> +	}			/* end for (k=...) */
> +
> +	/* send the bad bits back... */
> +	if (mode != DBTRAIN_DBI && xor_data) {
> +		xor_data[0] = bad_bits[0];
> +		xor_data[1] = bad_bits[1];
> +	}
> +
> +restore_or_dis:
> +	/* put RLEVEL_CTL[OR_DIS] back the way it was found */
> +	rlevel_ctl.s.or_dis = save_or_dis;
> +	lmc_wr(priv, CVMX_LMCX_RLEVEL_CTL(if_num), rlevel_ctl.u64);
> +
> +	return errors;
> +}
> +
> +/*
> + * Default data patterns for the hardware-assisted byte tests; the values
> + * are taken from the HRM section 6.9.13. Each byte_pattern_N table holds
> + * three words, labelled GP0, GP1 and GP2.
> + */
> +#define NUM_BYTE_PATTERNS 4
> +
> +/* compromise between time and rigor */
> +#define DEFAULT_BYTE_BURSTS 32
> +
> +static const u64 byte_pattern_0[] = {
> +	0xFFAAFFFFFF55FFFFULL,	/* GP0 */
> +	0x55555555AAAAAAAAULL,	/* GP1 */
> +	0xAA55AAAAULL,		/* GP2 */
> +};
> +
> +static const u64 byte_pattern_1[] = {
> +	0xFBF7EFDFBF7FFEFDULL,	/* GP0 */
> +	0x0F1E3C78F0E1C387ULL,	/* GP1 */
> +	0xF0E1BF7FULL,		/* GP2 */
> +};
> +
> +/* this is from Andrew via LFSR with PRBS=0xFFFFAAAA */
> +static const u64 byte_pattern_2[] = {
> +	0xEE55AADDEE55AADDULL,	/* GP0 */
> +	0x55AADDEE55AADDEEULL,	/* GP1 */
> +	0x55EEULL,		/* GP2 */
> +};
> +
> +/* this is from Mike via LFSR with PRBS=0x4A519909 */
> +static const u64 byte_pattern_3[] = {
> +	0x0088CCEE0088CCEEULL,	/* GP0 */
> +	0xBB552211BB552211ULL,	/* GP1 */
> +	0xBB00ULL,		/* GP2 */
> +};
> +
> +/* One entry per pattern index, in the same order as lfsr_patterns[] */
> +static const u64 *byte_patterns[NUM_BYTE_PATTERNS] = {
> +	byte_pattern_0,
> +	byte_pattern_1,
> +	byte_pattern_2,
> +	byte_pattern_3,
> +};
> +
> +/* PRBS seeds used instead of byte_patterns[] when LFSR mode is selected */
> +static const u32 lfsr_patterns[NUM_BYTE_PATTERNS] = {
> +	0xFFFFAAAAUL,
> +	0x06000000UL,
> +	0xAAAAFFFFUL,
> +	0x4A519909UL,
> +};
> +
> +/*
> + * Program a three-word data pattern into the GENERAL_PURPOSE0..2 registers
> + * of the given LMC.
> + *
> + * @priv: driver private data, handed to lmc_wr()
> + * @lmc: LMC index used to select the register instance
> + * @pattern_p: array of at least three words: [0] -> GENERAL_PURPOSE0,
> + *             [1] -> GENERAL_PURPOSE1, [2] -> GENERAL_PURPOSE2
> + */
> +static void setup_hw_pattern(struct ddr_priv *priv, int lmc,
> +			     const u64 *pattern_p)
> +{
> +	const u64 rising = pattern_p[0];
> +	const u64 falling = pattern_p[1];
> +	const u64 ecc = pattern_p[2];
> +
> +	/*
> +	 * 3) Setup GENERAL_PURPOSE[0-2] registers with the data pattern
> +	 * of choice.
> +	 * a. GENERAL_PURPOSE0[DATA<63:0>] – sets the initial lower
> +	 *    (rising edge) 64 bits of data.
> +	 */
> +	lmc_wr(priv, CVMX_LMCX_GENERAL_PURPOSE0(lmc), rising);
> +
> +	/*
> +	 * b. GENERAL_PURPOSE1[DATA<63:0>] – sets the initial upper
> +	 *    (falling edge) 64 bits of data.
> +	 */
> +	lmc_wr(priv, CVMX_LMCX_GENERAL_PURPOSE1(lmc), falling);
> +
> +	/*
> +	 * c. GENERAL_PURPOSE2[DATA<15:0>] – sets the initial lower
> +	 *    (rising edge <7:0>) and upper (falling edge <15:8>) ECC data.
> +	 */
> +	lmc_wr(priv, CVMX_LMCX_GENERAL_PURPOSE2(lmc), ecc);
> +}
> +
> +/*
> + * Configure LMC CHAR_CTL so that the LFSR generates a PRBS data pattern.
> + *
> + * @priv: driver private data, handed to lmc_rd()/lmc_wr()
> + * @lmc: LMC index used to select the CHAR_CTL instance
> + * @data: PRBS value to program; overridden by the "ddr_lfsr_prbs"
> + *        environment variable when that variable is set
> + */
> +static void setup_lfsr_pattern(struct ddr_priv *priv, int lmc, u32 data)
> +{
> +	const char *override = env_get("ddr_lfsr_prbs");
> +	u32 seed = override ? simple_strtoul(override, NULL, 0) : data;
> +	union cvmx_lmcx_char_ctl ctl;
> +
> +	/*
> +	 * 2) DBTRAIN_CTL[LFSR_PATTERN_SEL] = 1
> +	 * here data comes from the LFSR generating a PRBS pattern
> +	 * CHAR_CTL.EN = 0
> +	 * CHAR_CTL.SEL = 0; // for PRBS
> +	 * CHAR_CTL.DR = 1;
> +	 * CHAR_CTL.PRBS = setup for whatever type of PRBS to send
> +	 * CHAR_CTL.SKEW_ON = 1;
> +	 */
> +	ctl.u64 = lmc_rd(priv, CVMX_LMCX_CHAR_CTL(lmc));
> +	ctl.s.en = 0;
> +	ctl.s.sel = 0;
> +	ctl.s.dr = 1;
> +	ctl.s.prbs = seed;
> +	ctl.s.skew_on = 1;
> +	lmc_wr(priv, CVMX_LMCX_CHAR_CTL(lmc), ctl.u64);
> +}
> +
> +/*
> + * Pick the hardware-assist test mode that will actually be used.
> + *
> + * @lmc: not referenced in the body
> + * @mode: requested mode (DBTRAIN_TEST, DBTRAIN_DBI or DBTRAIN_LFSR)
> + *
> + * DBTRAIN_TEST is upgraded to DBTRAIN_LFSR on OCTEON_CN78XX_PASS2_X unless
> + * the "ddr_allow_lfsr" environment variable evaluates to 0. DBTRAIN_LFSR is
> + * downgraded to DBTRAIN_TEST on any other chip. DBTRAIN_DBI and unknown
> + * modes are returned unchanged (unknown modes are reported via debug()).
> + *
> + * Return: the mode to use
> + */
> +static int choose_best_hw_patterns(int lmc, int mode)
> +{
> +	int chosen = mode;
> +
> +	if (mode == DBTRAIN_TEST) {
> +		// always choose LFSR if chip supports it
> +		if (octeon_is_cpuid(OCTEON_CN78XX_PASS2_X)) {
> +			const char *allow = env_get("ddr_allow_lfsr");
> +			// the environment may override the default of "on"
> +			int use_lfsr = allow ? !!strtoul(allow, NULL, 0) : 1;
> +
> +			if (use_lfsr)
> +				chosen = DBTRAIN_LFSR;
> +		}
> +	} else if (mode == DBTRAIN_LFSR) {
> +		// forced already, but only legal when the chip has LFSR
> +		if (!octeon_is_cpuid(OCTEON_CN78XX_PASS2_X)) {
> +			debug("ERROR: illegal HW assist mode %d\n", mode);
> +			chosen = DBTRAIN_TEST;
> +		}
> +	} else if (mode != DBTRAIN_DBI) {
> +		// DBTRAIN_DBI: possibly can allow LFSR use?
> +		debug("ERROR: unknown HW assist mode %d\n", mode);
> +	}
> +
> +	if (chosen != mode)
> +		debug("%s: changing mode %d to %d\n", __func__, mode, chosen);
> +
> +	return chosen;
> +}
> +
> +/*
> + * Run the hardware-assisted byte test once per pattern and merge the
> + * results.
> + *
> + * @priv: driver private data
> + * @lmc: LMC index to test
> + * @phys_addr: address handed to test_dram_byte_hw()
> + * @mode: requested DBTRAIN_* mode; filtered by choose_best_hw_patterns()
> + * @xor_data: optional two-word output ([0] data bits, [1] ECC bits); may be
> + *            NULL
> + *
> + * test_dram_byte_hw() overwrites its xor_data output on every call, so
> + * handing the caller's buffer straight through would leave only the bad
> + * bits of the last pattern in it, while the return value covers all
> + * patterns. The bad bits are therefore collected per pattern and OR-ed
> + * together, and written back once at the end. As in test_dram_byte_hw(),
> + * nothing is written back in DBTRAIN_DBI mode.
> + *
> + * Return: OR of the error bitmasks returned for all patterns
> + */
> +int run_best_hw_patterns(struct ddr_priv *priv, int lmc, u64 phys_addr,
> +			 int mode, u64 *xor_data)
> +{
> +	int pattern;
> +	const u64 *pattern_p;
> +	int errs, errors = 0;
> +	u64 pat_bits[2];
> +	u64 all_bits[2] = { 0, 0 };
> +
> +	// FIXME? always choose LFSR if chip supports it???
> +	mode = choose_best_hw_patterns(lmc, mode);
> +
> +	for (pattern = 0; pattern < NUM_BYTE_PATTERNS; pattern++) {
> +		if (mode == DBTRAIN_LFSR) {
> +			setup_lfsr_pattern(priv, lmc, lfsr_patterns[pattern]);
> +		} else {
> +			pattern_p = byte_patterns[pattern];
> +			setup_hw_pattern(priv, lmc, pattern_p);
> +		}
> +
> +		// keep passing NULL when the caller did not ask for bad bits
> +		pat_bits[0] = 0;
> +		pat_bits[1] = 0;
> +		errs = test_dram_byte_hw(priv, lmc, phys_addr, mode,
> +					 xor_data ? pat_bits : NULL);
> +
> +		debug("%s: PATTERN %d at A:0x%012llx errors 0x%x\n",
> +		      __func__, pattern, phys_addr, errs);
> +
> +		errors |= errs;
> +		all_bits[0] |= pat_bits[0];
> +		all_bits[1] |= pat_bits[1];
> +	}
> +
> +	// send the accumulated bad bits back...
> +	if (mode != DBTRAIN_DBI && xor_data) {
> +		xor_data[0] = all_bits[0];
> +		xor_data[1] = all_bits[1];
> +	}
> +
> +	return errors;
> +}
> +
> +/*
> + * Sweep the DLL byte offset of one LMC using the hardware-assisted test and
> + * load the best offset found for each tested bytelane.
> + *
> + * For every pattern, offsets from -63 to 63 are stepped by
> + * BYTE_OFFSET_INCR. At each step every rank set in CONFIG[INIT_STATUS] is
> + * tested, and the longest error-free run is tracked per rank and bytelane.
> + * The per-pattern offset of a bytelane is the midpoint of the window common
> + * to all tested ranks; the value finally loaded is the divide_nint()
> + * average of the per-pattern offsets.
> + *
> + * @priv: driver private data
> + * @dll_offset_mode: passed to load_dll_offset(); 2 is reported as "Read",
> + *                   any other value as "Write"
> + * @lmc: LMC index to tune
> + * @bytelane: 0x0A selects bytelanes 0..8, otherwise only that bytelane
> + * @if_64b: not referenced in the body
> + * @dram_tune_rank_offset: not referenced in the body
> + * @dram_tune_byte_bursts: not referenced in the body
> + */
> +static void hw_assist_test_dll_offset(struct ddr_priv *priv,
> +				      int dll_offset_mode, int lmc,
> +				      int bytelane,
> +				      int if_64b,
> +				      u64 dram_tune_rank_offset,
> +				      int dram_tune_byte_bursts)
> +{
> +	int byte_offset, new_best_offset[9];
> +	int rank_delay_start[4][9];
> +	int rank_delay_count[4][9];
> +	int rank_delay_best_start[4][9];
> +	int rank_delay_best_count[4][9];
> +	int errors[4], off_errors, tot_errors;
> +	int rank_mask, rankx, active_ranks;
> +	int pattern;
> +	const u64 *pattern_p;
> +	int byte;
> +	char *mode_str = (dll_offset_mode == 2) ? "Read" : "Write";
> +	int pat_best_offset[9];
> +	u64 phys_addr;
> +	int pat_beg, pat_end;
> +	int rank_beg, rank_end;
> +	int byte_lo, byte_hi;
> +	union cvmx_lmcx_config lmcx_config;
> +	u64 hw_rank_offset;
> +	int num_lmcs = cvmx_dram_get_num_lmc(priv);
> +	// FIXME? always choose LFSR if chip supports it???
> +	int mode = choose_best_hw_patterns(lmc, DBTRAIN_TEST);
> +	int node = 0;
> +
> +	if (bytelane == 0x0A) {	// all bytelanes
> +		byte_lo = 0;
> +		byte_hi = 8;
> +	} else {		// just 1
> +		byte_lo = bytelane;
> +		byte_hi = bytelane;
> +	}
> +
> +	lmcx_config.u64 = lmc_rd(priv, CVMX_LMCX_CONFIG(lmc));
> +	rank_mask = lmcx_config.s.init_status;
> +
> +	// this should be correct for 1 or 2 ranks, 1 or 2 DIMMs
> +	hw_rank_offset =
> +	    1ull << (28 + lmcx_config.s.pbank_lsb - lmcx_config.s.rank_ena +
> +		     (num_lmcs / 2));
> +
> +	debug("N%d: %s: starting LMC%d with rank offset 0x%016llx\n",
> +	      node, __func__, lmc, (unsigned long long)hw_rank_offset);
> +
> +	// start of pattern loop
> +	// we do the set of tests for each pattern supplied...
> +
> +	memset(new_best_offset, 0, sizeof(new_best_offset));
> +	for (pattern = 0; pattern < NUM_BYTE_PATTERNS; pattern++) {
> +		memset(pat_best_offset, 0, sizeof(pat_best_offset));
> +
> +		if (mode == DBTRAIN_TEST) {
> +			pattern_p = byte_patterns[pattern];
> +			setup_hw_pattern(priv, lmc, pattern_p);
> +		} else {
> +			setup_lfsr_pattern(priv, lmc, lfsr_patterns[pattern]);
> +		}
> +
> +		// now loop through all legal values for the DLL byte offset...
> +
> +#define BYTE_OFFSET_INCR 3	// FIXME: make this tunable?
> +
> +		tot_errors = 0;
> +
> +		memset(rank_delay_count, 0, sizeof(rank_delay_count));
> +		memset(rank_delay_start, 0, sizeof(rank_delay_start));
> +		memset(rank_delay_best_count, 0, sizeof(rank_delay_best_count));
> +		memset(rank_delay_best_start, 0, sizeof(rank_delay_best_start));
> +
> +		for (byte_offset = -63; byte_offset < 64;
> +		     byte_offset += BYTE_OFFSET_INCR) {
> +			// do the setup on the active LMC
> +			// set the bytelanes DLL offsets
> +			change_dll_offset_enable(priv, lmc, 0);
> +			// FIXME? bytelane?
> +			load_dll_offset(priv, lmc, dll_offset_mode,
> +					byte_offset, bytelane);
> +			change_dll_offset_enable(priv, lmc, 1);
> +
> +			//bdk_watchdog_poke();
> +
> +			// run the test on each rank
> +			// only 1 call per rank should be enough, let the
> +			// bursts, loops, etc, control the load...
> +
> +			// errors for this byte_offset, all ranks
> +			off_errors = 0;
> +
> +			active_ranks = 0;
> +
> +			for (rankx = 0; rankx < 4; rankx++) {
> +				if (!(rank_mask & (1 << rankx)))
> +					continue;
> +
> +				phys_addr = hw_rank_offset * active_ranks;
> +				// FIXME: now done by test_dram_byte_hw()
> +				//phys_addr |= (lmc << 7);
> +				//phys_addr |= (u64)node << CVMX_NODE_MEM_SHIFT;
> +
> +				active_ranks++;
> +
> +				// NOTE: return is now a bitmask of the
> +				// erroring bytelanes.
> +				errors[rankx] =
> +				    test_dram_byte_hw(priv, lmc, phys_addr,
> +						      mode, NULL);
> +
> +				// process any errors in the bytelane(s) that
> +				// are being tested
> +				for (byte = byte_lo; byte <= byte_hi; byte++) {
> +					// check errors
> +					// yes, an error in the byte lane in
> +					// this rank
> +					if (errors[rankx] & (1 << byte)) {
> +						off_errors |= (1 << byte);
> +
> +						debug("N%d.LMC%d.R%d: Bytelane %d DLL %s Offset Test %3d: Address 0x%012llx errors\n",
> +						      node, lmc, rankx, byte,
> +						      mode_str, byte_offset,
> +						      phys_addr);
> +
> +						// had started run
> +						if (rank_delay_count
> +						    [rankx][byte] > 0) {
> +							debug("N%d.LMC%d.R%d: Bytelane %d DLL %s Offset Test %3d: stopping a run here\n",
> +							      node, lmc, rankx,
> +							      byte, mode_str,
> +							      byte_offset);
> +							// stop now
> +							rank_delay_count
> +								[rankx][byte] =
> +								0;
> +						}
> +						// FIXME: else had not started
> +						// run - nothing else to do?
> +					} else {
> +						// no error in the byte lane
> +						// first success, set run start
> +						if (rank_delay_count[rankx]
> +						    [byte] == 0) {
> +							debug("N%d.LMC%d.R%d: Bytelane %d DLL %s Offset Test %3d: starting a run here\n",
> +							      node, lmc, rankx,
> +							      byte, mode_str,
> +							      byte_offset);
> +							rank_delay_start[rankx]
> +								[byte] =
> +								byte_offset;
> +						}
> +						// bump run length
> +						rank_delay_count[rankx][byte]
> +							+= BYTE_OFFSET_INCR;
> +
> +						// is this now the biggest
> +						// window?
> +						if (rank_delay_count[rankx]
> +						    [byte] >
> +						    rank_delay_best_count[rankx]
> +						    [byte]) {
> +							rank_delay_best_count
> +							    [rankx][byte] =
> +							    rank_delay_count
> +							    [rankx][byte];
> +							rank_delay_best_start
> +							    [rankx][byte] =
> +							    rank_delay_start
> +							    [rankx][byte];
> +							debug("N%d.LMC%d.R%d: Bytelane %d DLL %s Offset Test %3d: updating best to %d/%d\n",
> +							      node, lmc, rankx,
> +							      byte, mode_str,
> +							      byte_offset,
> +							      rank_delay_best_start
> +							      [rankx][byte],
> +							      rank_delay_best_count
> +							      [rankx][byte]);
> +						}
> +					}
> +				}
> +			} /* for (rankx = 0; rankx < 4; rankx++) */
> +
> +			tot_errors |= off_errors;
> +		}
> +
> +		// set the bytelanes DLL offsets all back to 0
> +		change_dll_offset_enable(priv, lmc, 0);
> +		load_dll_offset(priv, lmc, dll_offset_mode, 0, bytelane);
> +		change_dll_offset_enable(priv, lmc, 1);
> +
> +		// now choose the best byte_offsets for this pattern
> +		// according to the best windows of the tested ranks
> +		// calculate offset by constructing an average window
> +		// from the rank windows
> +		for (byte = byte_lo; byte <= byte_hi; byte++) {
> +			pat_beg = -999;
> +			pat_end = 999;
> +
> +			for (rankx = 0; rankx < 4; rankx++) {
> +				if (!(rank_mask & (1 << rankx)))
> +					continue;
> +
> +				rank_beg = rank_delay_best_start[rankx][byte];
> +				pat_beg = max(pat_beg, rank_beg);
> +				rank_end = rank_beg +
> +					rank_delay_best_count[rankx][byte] -
> +					BYTE_OFFSET_INCR;
> +				pat_end = min(pat_end, rank_end);
> +
> +				debug("N%d.LMC%d.R%d: Bytelane %d DLL %s Offset Test:  Rank Window %3d:%3d\n",
> +				      node, lmc, rankx, byte, mode_str,
> +				      rank_beg, rank_end);
> +
> +			}	/* for (rankx = 0; rankx < 4; rankx++) */
> +
> +			pat_best_offset[byte] = (pat_end + pat_beg) / 2;
> +
> +			// sum the pattern averages
> +			new_best_offset[byte] += pat_best_offset[byte];
> +		}
> +
> +		// now print them on 1 line, descending order...
> +		debug("N%d.LMC%d: HW DLL %s Offset Pattern %d :",
> +		      node, lmc, mode_str, pattern);
> +		for (byte = byte_hi; byte >= byte_lo; --byte)
> +			debug(" %4d", pat_best_offset[byte]);
> +		debug("\n");
> +	}
> +	// end of pattern loop
> +
> +	debug("N%d.LMC%d: HW DLL %s Offset Average  : ", node, lmc, mode_str);
> +
> +	// print in descending byte index order
> +	for (byte = byte_hi; byte >= byte_lo; --byte) {
> +		// create the new average NINT
> +		new_best_offset[byte] = divide_nint(new_best_offset[byte],
> +						    NUM_BYTE_PATTERNS);
> +
> +		// print the best offsets from all patterns
> +
> +		// print just the offset of all the bytes
> +		if (bytelane == 0x0A)
> +			debug("%4d ", new_best_offset[byte]);
> +		else		// print the bytelanes also
> +			debug("(byte %d) %4d ", byte, new_best_offset[byte]);
> +
> +		// done with testing, load up the best offsets we found...
> +		// disable offsets while we load...
> +		change_dll_offset_enable(priv, lmc, 0);
> +		load_dll_offset(priv, lmc, dll_offset_mode,
> +				new_best_offset[byte], byte);
> +		// re-enable the offsets now that we are done loading
> +		change_dll_offset_enable(priv, lmc, 1);
> +	}
> +
> +	debug("\n");
> +}
> +
> +/*
> + * Automatically adjust the DLL offset for the selected bytelane using
> + * hardware-assist
> + *
> + * @priv: driver private data
> + * @dll_offset_mode: offset type to tune, forwarded to
> + *                   hw_assist_test_dll_offset() (2 = read; cvmx_tune_node()
> + *                   passes 1 for the write offsets)
> + * @bytelane: bytelane to tune, or 0x0A for all bytelanes
> + *
> + * For each LMC, ECC is enabled for the duration of the test and restored
> + * afterwards. The tuning is repeated "ddr_tune_ecc_loops" times (default 1).
> + * After tuning, any "ddr%d_tune_byte%d" environment override is loaded as a
> + * read offset (mode 2).
> + *
> + * Return: always 0
> + */
> +static int perform_HW_dll_offset_tuning(struct ddr_priv *priv,
> +					int dll_offset_mode, int bytelane)
> +{
> +	int if_64b;
> +	int save_ecc_ena[4];
> +	union cvmx_lmcx_config lmc_config;
> +	int lmc, num_lmcs = cvmx_dram_get_num_lmc(priv);
> +	const char *s;
> +	int loops = 1, loop;
> +	int by;
> +	u64 dram_tune_rank_offset;
> +	int dram_tune_byte_bursts = DEFAULT_BYTE_BURSTS;
> +	int node = 0;
> +
> +	// see if we want to do the tuning more than once per LMC...
> +	s = env_get("ddr_tune_ecc_loops");
> +	if (s)
> +		loops = strtoul(s, NULL, 0);
> +
> +	// allow override of the test repeats (bursts)
> +	s = env_get("ddr_tune_byte_bursts");
> +	if (s)
> +		dram_tune_byte_bursts = strtoul(s, NULL, 10);
> +
> +	// print current working values
> +	debug("N%d: H/W Tuning for bytelane %d will use %d loops, %d bursts, and %d patterns.\n",
> +	      node, bytelane, loops, dram_tune_byte_bursts, NUM_BYTE_PATTERNS);
> +
> +	// FIXME? get flag from LMC0 only
> +	lmc_config.u64 = lmc_rd(priv, CVMX_LMCX_CONFIG(0));
> +	if_64b = !lmc_config.s.mode32b;
> +
> +	// this should be correct for 1 or 2 ranks, 1 or 2 DIMMs
> +	dram_tune_rank_offset =
> +	    1ull << (28 + lmc_config.s.pbank_lsb - lmc_config.s.rank_ena +
> +		     (num_lmcs / 2));
> +
> +	// do once for each active LMC
> +
> +	for (lmc = 0; lmc < num_lmcs; lmc++) {
> +		debug("N%d: H/W Tuning: starting LMC%d bytelane %d tune.\n",
> +		      node, lmc, bytelane);
> +
> +		/* Enable ECC for the HW tests */
> +		// NOTE: we do enable ECC, but the HW tests used will not
> +		// generate "visible" errors
> +		lmc_config.u64 = lmc_rd(priv, CVMX_LMCX_CONFIG(lmc));
> +		save_ecc_ena[lmc] = lmc_config.s.ecc_ena;
> +		lmc_config.s.ecc_ena = 1;
> +		lmc_wr(priv, CVMX_LMCX_CONFIG(lmc), lmc_config.u64);
> +		lmc_config.u64 = lmc_rd(priv, CVMX_LMCX_CONFIG(lmc));
> +
> +		// testing is done on a single LMC at a time
> +		// FIXME: for now, loop here to show what happens multiple times
> +		for (loop = 0; loop < loops; loop++) {
> +			/*
> +			 * Perform DLL offset tuning for the offset type the
> +			 * caller asked for; a hard-coded 2 here would turn
> +			 * the write-offset pass into a second read pass.
> +			 */
> +			hw_assist_test_dll_offset(priv, dll_offset_mode, lmc,
> +						  bytelane,
> +						  if_64b, dram_tune_rank_offset,
> +						  dram_tune_byte_bursts);
> +		}
> +
> +		// perform cleanup on active LMC
> +		debug("N%d: H/W Tuning: finishing LMC%d bytelane %d tune.\n",
> +		      node, lmc, bytelane);
> +
> +		/* Restore ECC for DRAM tests */
> +		lmc_config.u64 = lmc_rd(priv, CVMX_LMCX_CONFIG(lmc));
> +		lmc_config.s.ecc_ena = save_ecc_ena[lmc];
> +		lmc_wr(priv, CVMX_LMCX_CONFIG(lmc), lmc_config.u64);
> +		lmc_config.u64 = lmc_rd(priv, CVMX_LMCX_CONFIG(lmc));
> +
> +		// finally, see if there are any read offset overrides
> +		// after tuning
> +		for (by = 0; by < 9; by++) {
> +			s = lookup_env(priv, "ddr%d_tune_byte%d", lmc, by);
> +			if (s) {
> +				int dllro = strtoul(s, NULL, 10);
> +
> +				change_dll_offset_enable(priv, lmc, 0);
> +				load_dll_offset(priv, lmc, 2, dllro, by);
> +				change_dll_offset_enable(priv, lmc, 1);
> +			}
> +		}
> +
> +	}			/* for (lmc = 0; lmc < num_lmcs; lmc++) */
> +
> +	// finish up...
> +
> +	return 0;
> +
> +}				/* perform_HW_dll_offset_tuning */
> +
> +/*
> + * Run the DLL read-offset tuning for all bytelanes and, when the
> + * "ddr_tune_write_offsets" environment variable evaluates to non-zero, the
> + * DLL write-offset tuning as well.
> + *
> + * Return: sum of the values returned by the tuning calls
> + */
> +static int cvmx_tune_node(struct ddr_priv *priv)
> +{
> +	const char *wr_env;
> +	int total;
> +	int rc;
> +	int tune_writes;
> +	int node = 0;
> +
> +	// Automatically tune the data and ECC byte DLL read offsets
> +	debug("N%d: Starting DLL Read Offset Tuning for LMCs\n", node);
> +	rc = perform_HW_dll_offset_tuning(priv, 2, 0x0A /* all bytelanes */);
> +	debug("N%d: Finished DLL Read Offset Tuning for LMCs, %d errors\n",
> +	      node, rc);
> +	total = rc;
> +
> +	// Write offset tuning is disabled by default for now (does not seem
> +	// to be needed?); the environment can switch it on
> +	wr_env = env_get("ddr_tune_write_offsets");
> +	tune_writes = wr_env ? !!strtoul(wr_env, NULL, 0) : 0;
> +	if (!tune_writes)
> +		return total;
> +
> +	// Automatically tune the data and ECC byte DLL write offsets
> +	debug("N%d: Starting DLL Write Offset Tuning for LMCs\n", node);
> +	rc = perform_HW_dll_offset_tuning(priv, 1, 0x0A /* all bytelanes */);
> +	debug("N%d: Finished DLL Write Offset Tuning for LMCs, %d errors\n",
> +	      node, rc);
> +	total += rc;
> +
> +	return total;
> +}
> +
> +// this routine makes the calls to the tuning routines when criteria are met
> +// intended to be called for automated tuning, to apply filtering...
> +
> +/* Index values for the three dimensions of ddr_speed_filter[][][] */
> +#define IS_DDR3  0
> +#define IS_DDR4  1
> +#define IS_UDIMM 0
> +#define IS_RDIMM 1
> +#define IS_2SLOT 0
> +#define IS_1SLOT 1
> +
> +/*
> + * Speed threshold per [DDR type][DIMM type][slot count]; 0 means disabled.
> + * FIXME: DDR3 is not tuned
> + */
> +static const u32 ddr_speed_filter[2][2][2] = {
> +	[IS_DDR3] = {
> +		[IS_UDIMM] = {
> +			[IS_2SLOT] = 0,	/* disabled */
> +			[IS_1SLOT] = 0,	/* disabled */
> +		},
> +		[IS_RDIMM] = {
> +			[IS_2SLOT] = 0,	/* disabled */
> +			[IS_1SLOT] = 0,	/* disabled */
> +		},
> +	},
> +	[IS_DDR4] = {
> +		[IS_UDIMM] = {
> +			[IS_2SLOT] = 940,
> +			[IS_1SLOT] = 1050,
> +		},
> +		[IS_RDIMM] = {
> +			[IS_2SLOT] = 800,
> +			[IS_1SLOT] = 940,
> +		},
> +	},
> +};
> +
> +/*
> + * Run cvmx_tune_node() when the configuration qualifies for auto-tuning.
> + *
> + * @priv: driver private data
> + * @ddr_speed: DDR speed in Hz (scaled to MHz internally)
> + *
> + * If the "ddr_tune_all_configs" environment variable is set, it alone
> + * decides: non-zero forces tuning, zero suppresses it. Otherwise the DDR
> + * type, DIMM type and slot count sampled from LMC0 select a threshold from
> + * ddr_speed_filter[][][], and tuning runs only when that threshold is
> + * non-zero and the speed in MHz is above it.
> + */
> +void cvmx_maybe_tune_node(struct ddr_priv *priv, u32 ddr_speed)
> +{
> +	union cvmx_lmcx_ddr_pll_ctl pll_ctl;
> +	union cvmx_lmcx_control control;
> +	union cvmx_lmcx_config config;
> +	const char *force;
> +	u32 threshold;
> +	int ddr4, rdimm, one_slot;
> +	int tune = 0;
> +	int node = 0;
> +
> +	// scale it down from Hz to MHz
> +	ddr_speed = divide_nint(ddr_speed, 1000000);
> +
> +	// FIXME: allow an override here so that all configs can be tuned
> +	// or none
> +	// If the envvar is defined, always either force it or avoid it
> +	// accordingly
> +	force = env_get("ddr_tune_all_configs");
> +	if (force) {
> +		tune = !!strtoul(force, NULL, 0);
> +		printf("N%d: DRAM auto-tuning %s.\n", node,
> +		       tune ? "forced" : "disabled");
> +		if (tune)
> +			cvmx_tune_node(priv);
> +
> +		return;
> +	}
> +
> +	// filter the tuning calls here...
> +	// determine if we should/can run automatically for this configuration
> +	//
> +	// FIXME: tune only when the configuration indicates it will help:
> +	//    DDR type, RDIMM or UDIMM, 1-slot or 2-slot, and speed
> +	//
> +	// all three registers are sampled from LMC0
> +	config.u64 = lmc_rd(priv, CVMX_LMCX_CONFIG(0));
> +	control.u64 = lmc_rd(priv, CVMX_LMCX_CONTROL(0));
> +	pll_ctl.u64 = lmc_rd(priv, CVMX_LMCX_DDR_PLL_CTL(0));
> +
> +	ddr4 = !!pll_ctl.s.ddr4_mode;
> +	rdimm = !!control.s.rdimm_ena;
> +	// HACK, should do better
> +	one_slot = (config.s.init_status < 4);
> +
> +	threshold = ddr_speed_filter[ddr4][rdimm][one_slot];
> +	tune = (threshold != 0 && ddr_speed > threshold);
> +
> +	debug("N%d: DDR%d %cDIMM %d-slot at %d MHz %s eligible for auto-tuning.\n",
> +	      node, ddr4 ? 4 : 3, rdimm ? 'R' : 'U',
> +	      one_slot ? 1 : 2, ddr_speed, tune ? "is" : "is not");
> +
> +	// call the tuning routine, filtering is done...
> +	if (tune)
> +		cvmx_tune_node(priv);
> +}
> +
> +/*
> + * first pattern example:
> + * GENERAL_PURPOSE0.DATA == 64'h00ff00ff00ff00ff;
> + * GENERAL_PURPOSE1.DATA == 64'h00ff00ff00ff00ff;
> + * GENERAL_PURPOSE2.DATA == 16'h0000;
> + */
> +
> +static const u64 dbi_pattern[3] = {
> +	0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL, 0x0000ULL };
> +
> +/*
> + * Translate a MODEREG_PARAMS0[CL] field value into the value to program
> + * once read DBI is enabled.
> + *
> + * @old_cl: current MODEREG_PARAMS0[CL] field value
> + * @old_cwl: current MODEREG_PARAMS0[CWL] field value (only consulted when
> + *           old_cl is 5)
> + *
> + * Return: adjusted CL field value, or -1 if old_cl has no known adjustment
> + */
> +static int dbi_switchover_new_cl(int old_cl, int old_cwl)
> +{
> +	switch (old_cl) {
> +	case 0:
> +	case 1:
> +	case 2:
> +	case 3:
> +	case 4:
> +		return old_cl + 2;	// 9-13->11-15
> +	case 5:
> +		// CL=14, CWL=10,12 gets +2, CWL=11,14 gets +3
> +		return ((old_cwl == 1) || (old_cwl == 3)) ? 7 : 13;
> +	case 6:
> +		return 8;	// 15->18
> +	case 7:
> +		return 14;	// 16->19
> +	case 8:
> +		return 15;	// 18->21
> +	default:
> +		return -1;
> +	}
> +}
> +
> +/*
> + * Perform switchover to DBI on one LMC.
> + *
> + * @priv: driver private data
> + * @lmc: LMC index to switch over
> + *
> + * Returns without changing the DBI CSRs when the interface is not in DDR4
> + * mode, uses x4 devices, or has a CL setting with no known DBI adjustment.
> + * Otherwise enables write/read DBI, loads the DBI data pattern, adjusts CL,
> + * issues MRW to MR0 and MR5 on every initialised rank, and runs the DBI
> + * deskew training (up to 10 attempts per rank) until all bytelanes lock.
> + */
> +static void cvmx_dbi_switchover_interface(struct ddr_priv *priv, int lmc)
> +{
> +	union cvmx_lmcx_modereg_params0 modereg_params0;
> +	union cvmx_lmcx_modereg_params3 modereg_params3;
> +	union cvmx_lmcx_phy_ctl phy_ctl;
> +	union cvmx_lmcx_config lmcx_config;
> +	union cvmx_lmcx_ddr_pll_ctl ddr_pll_ctl;
> +	int rank_mask, rankx, active_ranks;
> +	u64 phys_addr, rank_offset;
> +	int num_lmcs, errors;
> +	int dbi_settings[9], byte, unlocked, retries;
> +	int ecc_ena;
> +	int old_cl, old_cwl, new_cl;
> +	int rank_max = 1;	// FIXME: make this 4 to try all the ranks
> +	int node = 0;
> +
> +	ddr_pll_ctl.u64 = lmc_rd(priv, CVMX_LMCX_DDR_PLL_CTL(0));
> +
> +	lmcx_config.u64 = lmc_rd(priv, CVMX_LMCX_CONFIG(lmc));
> +	rank_mask = lmcx_config.s.init_status;
> +	ecc_ena = lmcx_config.s.ecc_ena;
> +
> +	// FIXME: must filter out any non-supported configs
> +	//        ie, no DDR3, no x4 devices
> +	if (ddr_pll_ctl.s.ddr4_mode == 0 || lmcx_config.s.mode_x4dev == 1) {
> +		debug("N%d.LMC%d: DBI switchover: inappropriate device; EXITING...\n",
> +		      node, lmc);
> +		return;
> +	}
> +
> +	// this should be correct for 1 or 2 ranks, 1 or 2 DIMMs
> +	num_lmcs = cvmx_dram_get_num_lmc(priv);
> +	rank_offset = 1ull << (28 + lmcx_config.s.pbank_lsb -
> +			       lmcx_config.s.rank_ena + (num_lmcs / 2));
> +
> +	debug("N%d.LMC%d: DBI switchover: rank mask 0x%x, rank size 0x%016llx.\n",
> +	      node, lmc, rank_mask, (unsigned long long)rank_offset);
> +
> +	/*
> +	 * 1. conduct the current init sequence as usual all the way
> +	 * after software write leveling.
> +	 */
> +
> +	read_dac_dbi_settings(priv, lmc, /*DBI*/ 0, dbi_settings);
> +
> +	display_dac_dbi_settings(lmc, /*DBI*/ 0, ecc_ena, dbi_settings,
> +				 " INIT");
> +
> +	/*
> +	 * 2. set DBI related CSRs as below and issue MR write.
> +	 * MODEREG_PARAMS3.WR_DBI=1
> +	 * MODEREG_PARAMS3.RD_DBI=1
> +	 * PHY_CTL.DBI_MODE_ENA=1
> +	 */
> +	modereg_params0.u64 = lmc_rd(priv, CVMX_LMCX_MODEREG_PARAMS0(lmc));
> +
> +	/*
> +	 * Work out the DBI-adjusted CAS latency before the DBI CSR writes
> +	 * below, so that an unsupported CL setting is rejected up front
> +	 * instead of writing -1 into MODEREG_PARAMS0[CL] with DBI already
> +	 * enabled.
> +	 */
> +	old_cl = modereg_params0.s.cl;
> +	old_cwl = modereg_params0.s.cwl;
> +	new_cl = dbi_switchover_new_cl(old_cl, old_cwl);
> +	if (new_cl < 0) {
> +		printf("ERROR: Bad CL value (%d) for DBI switchover.\n",
> +		       old_cl);
> +		return;
> +	}
> +
> +	modereg_params3.u64 = lmc_rd(priv, CVMX_LMCX_MODEREG_PARAMS3(lmc));
> +	modereg_params3.s.wr_dbi = 1;
> +	modereg_params3.s.rd_dbi = 1;
> +	lmc_wr(priv, CVMX_LMCX_MODEREG_PARAMS3(lmc), modereg_params3.u64);
> +
> +	phy_ctl.u64 = lmc_rd(priv, CVMX_LMCX_PHY_CTL(lmc));
> +	phy_ctl.s.dbi_mode_ena = 1;
> +	lmc_wr(priv, CVMX_LMCX_PHY_CTL(lmc), phy_ctl.u64);
> +
> +	/*
> +	 * there are two options for data to send.  Lets start with (1)
> +	 * and could move to (2) in the future:
> +	 *
> +	 * 1) DBTRAIN_CTL[LFSR_PATTERN_SEL] = 0 (or for older chips where
> +	 * this does not exist) set data directly in these registers.
> +	 * this will yield a clk/2 pattern:
> +	 * GENERAL_PURPOSE0.DATA == 64'h00ff00ff00ff00ff;
> +	 * GENERAL_PURPOSE1.DATA == 64'h00ff00ff00ff00ff;
> +	 * GENERAL_PURPOSE2.DATA == 16'h0000;
> +	 * 2) DBTRAIN_CTL[LFSR_PATTERN_SEL] = 1
> +	 * here data comes from the LFSR generating a PRBS pattern
> +	 * CHAR_CTL.EN = 0
> +	 * CHAR_CTL.SEL = 0; // for PRBS
> +	 * CHAR_CTL.DR = 1;
> +	 * CHAR_CTL.PRBS = setup for whatever type of PRBS to send
> +	 * CHAR_CTL.SKEW_ON = 1;
> +	 */
> +	lmc_wr(priv, CVMX_LMCX_GENERAL_PURPOSE0(lmc), dbi_pattern[0]);
> +	lmc_wr(priv, CVMX_LMCX_GENERAL_PURPOSE1(lmc), dbi_pattern[1]);
> +	lmc_wr(priv, CVMX_LMCX_GENERAL_PURPOSE2(lmc), dbi_pattern[2]);
> +
> +	/*
> +	 * 3. adjust cas_latency (only necessary if RD_DBI is set).
> +	 * here is my code for doing this:
> +	 *
> +	 * if (csr_model.MODEREG_PARAMS3.RD_DBI.value == 1) begin
> +	 * case (csr_model.MODEREG_PARAMS0.CL.value)
> +	 * 0,1,2,3,4: csr_model.MODEREG_PARAMS0.CL.value += 2;
> +	 * // CL 9-13 -> 11-15
> +	 * 5: begin
> +	 * // CL=14, CWL=10,12 gets +2, CWL=11,14 gets +3
> +	 * if((csr_model.MODEREG_PARAMS0.CWL.value==1 ||
> +	 * csr_model.MODEREG_PARAMS0.CWL.value==3))
> +	 * csr_model.MODEREG_PARAMS0.CL.value = 7; // 14->16
> +	 * else
> +	 * csr_model.MODEREG_PARAMS0.CL.value = 13; // 14->17
> +	 * end
> +	 * 6: csr_model.MODEREG_PARAMS0.CL.value = 8; // 15->18
> +	 * 7: csr_model.MODEREG_PARAMS0.CL.value = 14; // 16->19
> +	 * 8: csr_model.MODEREG_PARAMS0.CL.value = 15; // 18->21
> +	 * default:
> +	 * `cn_fatal(("Error mem_cfg (%s) CL (%d) with RD_DBI=1,
> +	 * I am not sure what to do.",
> +	 * mem_cfg, csr_model.MODEREG_PARAMS3.RD_DBI.value))
> +	 * endcase
> +	 * end
> +	 *
> +	 * The mapping itself lives in dbi_switchover_new_cl(); new_cl was
> +	 * already validated above.
> +	 */
> +
> +	if (modereg_params3.s.rd_dbi == 1) {
> +		debug("N%d.LMC%d: DBI switchover: CL ADJ: old_cl 0x%x, old_cwl 0x%x, new_cl 0x%x.\n",
> +		      node, lmc, old_cl, old_cwl, new_cl);
> +		modereg_params0.s.cl = new_cl;
> +		lmc_wr(priv, CVMX_LMCX_MODEREG_PARAMS0(lmc),
> +		       modereg_params0.u64);
> +	}
> +
> +	/*
> +	 * 4. issue MRW to MR0 (CL) and MR5 (DBI), using LMC sequence
> +	 * SEQ_CTL[SEQ_SEL] = MRW.
> +	 */
> +	// Use the default values, from the CSRs fields
> +	// also, do B-sides for RDIMMs...
> +
> +	for (rankx = 0; rankx < 4; rankx++) {
> +		if (!(rank_mask & (1 << rankx)))
> +			continue;
> +
> +		// for RDIMMs, B-side writes should get done automatically
> +		// when the A-side is written
> +		ddr4_mrw(priv, lmc, rankx, -1 /* use_default */,
> +			 0 /*MRreg */, 0 /*A-side */);	/* MR0 */
> +		ddr4_mrw(priv, lmc, rankx, -1 /* use_default */,
> +			 5 /*MRreg */, 0 /*A-side */);	/* MR5 */
> +	}
> +
> +	/*
> +	 * 5. conduct DBI bit deskew training via the General Purpose
> +	 * R/W sequence (dbtrain). may need to run this over and over to get
> +	 * a lock (I need up to 5 in simulation):
> +	 * SEQ_CTL[SEQ_SEL] = RW_TRAINING (15)
> +	 * DBTRAIN_CTL.CMD_COUNT_EXT = all 1's
> +	 * DBTRAIN_CTL.READ_CMD_COUNT = all 1's
> +	 * DBTRAIN_CTL.TCCD_SEL = set according to MODEREG_PARAMS3[TCCD_L]
> +	 * DBTRAIN_CTL.RW_TRAIN = 1
> +	 * DBTRAIN_CTL.READ_DQ_COUNT = dont care
> +	 * DBTRAIN_CTL.WRITE_ENA = 1;
> +	 * DBTRAIN_CTL.ACTIVATE = 1;
> +	 * DBTRAIN_CTL LRANK, PRANK, ROW_A, BG, BA, COLUMN_A = set to a
> +	 * valid address
> +	 *
> +	 * NOTE(review): test_dram_byte_hw() starts sequence 14 for
> +	 * RW_TRAINING, so the "(15)" above looks stale - confirm against
> +	 * the HRM.
> +	 */
> +
> +	// NOW - do the training
> +	debug("N%d.LMC%d: DBI switchover: TRAINING begins...\n", node, lmc);
> +
> +	active_ranks = 0;
> +	for (rankx = 0; rankx < rank_max; rankx++) {
> +		if (!(rank_mask & (1 << rankx)))
> +			continue;
> +
> +		phys_addr = rank_offset * active_ranks;
> +		// FIXME: now done by test_dram_byte_hw()
> +
> +		active_ranks++;
> +
> +		retries = 0;
> +
> +restart_training:
> +
> +		// NOTE: return is a bitmask of the erroring bytelanes -
> +		// we only print it
> +		errors =
> +		    test_dram_byte_hw(priv, lmc, phys_addr, DBTRAIN_DBI, NULL);
> +
> +		debug("N%d.LMC%d: DBI switchover: TEST: rank %d, phys_addr 0x%llx, errors 0x%x.\n",
> +		      node, lmc, rankx, (unsigned long long)phys_addr, errors);
> +
> +		// NEXT - check for locking
> +		// bit 0 of each setting is the lock flag; count the clear ones
> +		unlocked = 0;
> +		read_dac_dbi_settings(priv, lmc, /*DBI*/ 0, dbi_settings);
> +
> +		for (byte = 0; byte < (8 + ecc_ena); byte++)
> +			unlocked += (dbi_settings[byte] & 1) ^ 1;
> +
> +		// FIXME: print out the DBI settings array after each rank?
> +		if (rank_max > 1)	// only when doing more than 1 rank
> +			display_dac_dbi_settings(lmc, /*DBI*/ 0, ecc_ena,
> +						 dbi_settings, " RANK");
> +
> +		if (unlocked > 0) {
> +			debug("N%d.LMC%d: DBI switchover: LOCK: %d still unlocked.\n",
> +			      node, lmc, unlocked);
> +			retries++;
> +			if (retries < 10) {
> +				goto restart_training;
> +			} else {
> +				debug("N%d.LMC%d: DBI switchover: LOCK: %d retries exhausted.\n",
> +				      node, lmc, retries);
> +			}
> +		}
> +	}			/* for (rankx = 0; rankx < rank_max; rankx++) */
> +
> +	// print out the final DBI settings array
> +	display_dac_dbi_settings(lmc, /*DBI*/ 0, ecc_ena, dbi_settings,
> +				 "FINAL");
> +}
> +
> +/*
> + * Perform the DBI switchover on every LMC reported by
> + * cvmx_dram_get_num_lmc(), in ascending index order.
> + */
> +void cvmx_dbi_switchover(struct ddr_priv *priv)
> +{
> +	const int lmc_count = cvmx_dram_get_num_lmc(priv);
> +	int idx = 0;
> +
> +	while (idx < lmc_count) {
> +		cvmx_dbi_switchover_interface(priv, idx);
> +		idx++;
> +	}
> +}
-- 
- Daniel



More information about the U-Boot mailing list