From ef9d61ae9ed490301adf9ee064a9fc1190069d81 Mon Sep 17 00:00:00 2001 From: Mate Kukri Date: Mon, 20 May 2024 11:43:35 +0100 Subject: Add native NVMe driver based on SeaBIOS Tested to successfully boot Debian on QEMU and OptiPlex 3050. Signed-off-by: Mate Kukri --- Makefile.am | 2 +- grub-core/Makefile.core.def | 6 + grub-core/commands/nativedisk.c | 1 + grub-core/disk/nvme-int.h | 208 +++++++++ grub-core/disk/nvme.c | 781 ++++++++++++++++++++++++++++++++ include/grub/disk.h | 1 + 6 files changed, 998 insertions(+), 1 deletion(-) create mode 100644 grub-core/disk/nvme-int.h create mode 100644 grub-core/disk/nvme.c diff --git a/Makefile.am b/Makefile.am index 65016f856..7bc0866ba 100644 --- a/Makefile.am +++ b/Makefile.am @@ -434,7 +434,7 @@ if COND_i386_coreboot FS_PAYLOAD_MODULES ?= $(shell cat grub-core/fs.lst) default_payload.elf: grub-mkstandalone grub-mkimage FORCE test -f $@ && rm $@ || true - pkgdatadir=. ./grub-mkstandalone --grub-mkimage=./grub-mkimage -O i386-coreboot -o $@ --modules='ahci pata xhci ehci uhci ohci usb_keyboard usbms part_msdos ext2 fat at_keyboard part_gpt usbserial_usbdebug cbfs' --install-modules='ls linux search configfile normal cbtime cbls memrw iorw minicmd lsmmap lspci halt reboot hexdump pcidump regexp setpci lsacpi chain test serial multiboot cbmemc linux16 gzio echo help syslinuxcfg xnu $(FS_PAYLOAD_MODULES) password_pbkdf2 $(EXTRA_PAYLOAD_MODULES)' --fonts= --themes= --locales= -d grub-core/ /boot/grub/grub.cfg=$(srcdir)/coreboot.cfg + pkgdatadir=. ./grub-mkstandalone --grub-mkimage=./grub-mkimage -O i386-coreboot -o $@ --modules='ahci pata nvme xhci ehci uhci ohci usb_keyboard usbms part_msdos ext2 fat at_keyboard part_gpt usbserial_usbdebug cbfs' --install-modules='ls linux search configfile normal cbtime cbls memrw iorw minicmd lsmmap lspci halt reboot hexdump pcidump regexp setpci lsacpi chain test serial multiboot cbmemc linux16 gzio echo help syslinuxcfg xnu $(FS_PAYLOAD_MODULES) password_pbkdf2 $(EXTRA_PAYLOAD_MODULES)' --fonts= --themes= --locales= -d grub-core/ /boot/grub/grub.cfg=$(srcdir)/coreboot.cfg endif endif diff --git a/grub-core/Makefile.core.def b/grub-core/Makefile.core.def index ea782666d..0f893369a 100644 --- a/grub-core/Makefile.core.def +++ b/grub-core/Makefile.core.def @@ -2602,3 +2602,9 @@ module = { efi = commands/bli.c; enable = efi; }; + +module = { + name = nvme; + common = disk/nvme.c; + enable = pci; +}; diff --git a/grub-core/commands/nativedisk.c b/grub-core/commands/nativedisk.c index 580c8d3b0..a2c766fbd 100644 --- a/grub-core/commands/nativedisk.c +++ b/grub-core/commands/nativedisk.c @@ -78,6 +78,7 @@ get_uuid (const char *name, char **uuid, int getnative) case GRUB_DISK_DEVICE_ATA_ID: case GRUB_DISK_DEVICE_SCSI_ID: case GRUB_DISK_DEVICE_XEN: + case GRUB_DISK_DEVICE_NVME_ID: if (getnative) break; /* FALLTHROUGH */ diff --git a/grub-core/disk/nvme-int.h b/grub-core/disk/nvme-int.h new file mode 100644 index 000000000..1295b58aa --- /dev/null +++ b/grub-core/disk/nvme-int.h @@ -0,0 +1,208 @@ +// NVMe datastructures and constants +// +// Copyright 2017 Amazon.com, Inc. or its affiliates. +// +// This file may be distributed under the terms of the GNU LGPLv3 license. + +#ifndef __NVME_INT_H +#define __NVME_INT_H + +#include + +/* Data structures */ + +/* The register file of a NVMe host controller. This struct follows the naming + scheme in the NVMe specification. */ +struct nvme_reg { + grub_uint64_t cap; /* controller capabilities */ + grub_uint32_t vs; /* version */ + grub_uint32_t intms; /* interrupt mask set */ + grub_uint32_t intmc; /* interrupt mask clear */ + grub_uint32_t cc; /* controller configuration */ + grub_uint32_t _res0; + grub_uint32_t csts; /* controller status */ + grub_uint32_t _res1; + grub_uint32_t aqa; /* admin queue attributes */ + grub_uint64_t asq; /* admin submission queue base address */ + grub_uint64_t acq; /* admin completion queue base address */ +}; + +/* Submission queue entry */ +struct nvme_sqe { + union { + grub_uint32_t dword[16]; + struct { + grub_uint32_t cdw0; /* Command DWORD 0 */ + grub_uint32_t nsid; /* Namespace ID */ + grub_uint64_t _res0; + grub_uint64_t mptr; /* metadata ptr */ + + grub_uint64_t dptr_prp1; + grub_uint64_t dptr_prp2; + }; + }; +}; + +/* Completion queue entry */ +struct nvme_cqe { + union { + grub_uint32_t dword[4]; + struct { + grub_uint32_t cdw0; + grub_uint32_t _res0; + grub_uint16_t sq_head; + grub_uint16_t sq_id; + grub_uint16_t cid; + grub_uint16_t status; + }; + }; +}; + +/* The common part of every submission or completion queue. */ +struct nvme_queue { + grub_uint32_t *dbl; /* doorbell */ + grub_uint16_t mask; /* length - 1 */ +}; + +struct nvme_cq { + struct nvme_queue common; + struct nvme_cqe *cqe; + + /* We have read upto (but not including) this entry in the queue. */ + grub_uint16_t head; + + /* The current phase bit the controller uses to indicate that it has written + a new entry. This is inverted after each wrap. */ + unsigned phase : 1; +}; + +struct nvme_sq { + struct nvme_queue common; + struct nvme_sqe *sqe; + + /* Corresponding completion queue. We only support a single SQ per CQ. */ + struct nvme_cq *cq; + + /* The last entry the controller has fetched. */ + grub_uint16_t head; + + /* The last value we have written to the tail doorbell. */ + grub_uint16_t tail; +}; + +struct nvme_ctrl { + grub_pci_device_t pci; + struct nvme_reg volatile *reg; + + grub_uint32_t ctrlnum; + + grub_uint32_t doorbell_stride; /* in bytes */ + + struct nvme_sq admin_sq; + struct nvme_cq admin_cq; + + grub_uint32_t ns_count; + + struct nvme_sq io_sq; + struct nvme_cq io_cq; +}; + +struct nvme_namespace { + struct nvme_namespace *next; + struct nvme_namespace **prev; + + char *devname; + + grub_uint32_t nsnum; + + struct nvme_ctrl *ctrl; + + grub_uint32_t ns_id; + + grub_uint64_t lba_count; /* The total amount of sectors. */ + + grub_uint32_t block_size; + grub_uint32_t metadata_size; + grub_uint32_t max_req_size; +}; + +/* Data structures for NVMe admin identify commands */ + +struct nvme_identify_ctrl { + grub_uint16_t vid; + grub_uint16_t ssvid; + char sn[20]; + char mn[40]; + char fr[8]; + + grub_uint8_t rab; + grub_uint8_t ieee[3]; + grub_uint8_t cmic; + grub_uint8_t mdts; + + char _boring[516 - 78]; + + grub_uint32_t nn; /* number of namespaces */ +}; + +struct nvme_identify_ns_list { + grub_uint32_t ns_id[1024]; +}; + +struct nvme_lba_format { + grub_uint16_t ms; + grub_uint8_t lbads; + grub_uint8_t rp; +}; + +struct nvme_identify_ns { + grub_uint64_t nsze; + grub_uint64_t ncap; + grub_uint64_t nuse; + grub_uint8_t nsfeat; + grub_uint8_t nlbaf; + grub_uint8_t flbas; + + char _boring[128 - 27]; + + struct nvme_lba_format lbaf[16]; +}; + +union nvme_identify { + struct nvme_identify_ns ns; + struct nvme_identify_ctrl ctrl; + struct nvme_identify_ns_list ns_list; +}; + +/* NVMe constants */ + +#define NVME_CAP_CSS_NVME (1ULL << 37) + +#define NVME_CSTS_FATAL (1U << 1) +#define NVME_CSTS_RDY (1U << 0) + +#define NVME_CC_EN (1U << 0) + +#define NVME_SQE_OPC_ADMIN_CREATE_IO_SQ 1U +#define NVME_SQE_OPC_ADMIN_CREATE_IO_CQ 5U +#define NVME_SQE_OPC_ADMIN_IDENTIFY 6U + +#define NVME_SQE_OPC_IO_WRITE 1U +#define NVME_SQE_OPC_IO_READ 2U + +#define NVME_ADMIN_IDENTIFY_CNS_ID_NS 0U +#define NVME_ADMIN_IDENTIFY_CNS_ID_CTRL 1U +#define NVME_ADMIN_IDENTIFY_CNS_GET_NS_LIST 2U + +#define NVME_CQE_DW3_P (1U << 16) + +#define NVME_PAGE_SIZE 4096 +#define NVME_PAGE_MASK ~(NVME_PAGE_SIZE - 1) + +/* Length for the queue entries. */ +#define NVME_SQE_SIZE_LOG 6 +#define NVME_CQE_SIZE_LOG 4 + +#endif + +/* EOF */ diff --git a/grub-core/disk/nvme.c b/grub-core/disk/nvme.c new file mode 100644 index 000000000..093237c70 --- /dev/null +++ b/grub-core/disk/nvme.c @@ -0,0 +1,781 @@ +// Low level NVMe disk access +// +// Based on SeaBIOS NVMe driver - Copyright 2017 Amazon.com, Inc. or its affiliates. +// Port to GRUB2 done by Mate Kukri +// +// This file may be distributed under the terms of the GNU LGPLv3 license. + +#include +#include +#include +#include "nvme-int.h" + +GRUB_MOD_LICENSE ("GPLv3"); /* LGPLv3 in reality but it is GPLv3 compatible */ + +static grub_uint32_t grub_nvme_ctrlcnt; +static grub_uint32_t grub_nvme_nscnt; + +static struct nvme_namespace *grub_nvme_namespaces; + +// Page aligned "dma bounce buffer" of size NVME_PAGE_SIZE +static void *nvme_dma_buffer; + +static void * +zalloc_page_aligned(grub_uint32_t size) +{ + void *res = grub_memalign(NVME_PAGE_SIZE, size); + if (res) grub_memset(res, 0, size); + return res; +} + +static void +nvme_init_queue_common(struct nvme_ctrl *ctrl, struct nvme_queue *q, grub_uint16_t q_idx, + grub_uint16_t length) +{ + grub_memset(q, 0, sizeof(*q)); + q->dbl = (grub_uint32_t *)((char *)ctrl->reg + 0x1000 + q_idx * ctrl->doorbell_stride); + grub_dprintf("nvme", " q %p q_idx %u dbl %p\n", q, q_idx, q->dbl); + q->mask = length - 1; +} + +static int +nvme_init_sq(struct nvme_ctrl *ctrl, struct nvme_sq *sq, grub_uint16_t q_idx, grub_uint16_t length, + struct nvme_cq *cq) +{ + nvme_init_queue_common(ctrl, &sq->common, q_idx, length); + sq->sqe = zalloc_page_aligned(sizeof(*sq->sqe) * length); + + if (!sq->sqe) { + return -1; + } + + grub_dprintf("nvme", "sq %p q_idx %u sqe %p\n", sq, q_idx, sq->sqe); + sq->cq = cq; + sq->head = 0; + sq->tail = 0; + + return 0; +} + +static int +nvme_init_cq(struct nvme_ctrl *ctrl, struct nvme_cq *cq, grub_uint16_t q_idx, grub_uint16_t length) +{ + nvme_init_queue_common(ctrl, &cq->common, q_idx, length); + cq->cqe = zalloc_page_aligned(sizeof(*cq->cqe) * length); + if (!cq->cqe) { + return -1; + } + + cq->head = 0; + + /* All CQE phase bits are initialized to zero. This means initially we wait + for the host controller to set these to 1. */ + cq->phase = 1; + + return 0; +} + +static int +nvme_poll_cq(struct nvme_cq *cq) +{ + grub_uint32_t dw3 = *(volatile grub_uint32_t *) &cq->cqe[cq->head].dword[3]; + return (!!(dw3 & NVME_CQE_DW3_P) == cq->phase); +} + +static int +nvme_is_cqe_success(struct nvme_cqe const *cqe) +{ + return ((cqe->status >> 1) & 0xFF) == 0; +} + +static struct nvme_cqe +nvme_error_cqe(void) +{ + struct nvme_cqe r; + + /* 0xFF is a vendor specific status code != success. Should be okay for + indicating failure. */ + grub_memset(&r, 0xFF, sizeof(r)); + return r; +} + +static struct nvme_cqe +nvme_consume_cqe(struct nvme_sq *sq) +{ + struct nvme_cq *cq = sq->cq; + + if (!nvme_poll_cq(cq)) { + /* Cannot consume a completion queue entry, if there is none ready. */ + return nvme_error_cqe(); + } + + struct nvme_cqe *cqe = &cq->cqe[cq->head]; + grub_uint16_t cq_next_head = (cq->head + 1) & cq->common.mask; + grub_dprintf("nvme", "cq %p head %u -> %u\n", cq, cq->head, cq_next_head); + if (cq_next_head < cq->head) { + grub_dprintf("nvme", "cq %p wrap\n", cq); + cq->phase = ~cq->phase; + } + cq->head = cq_next_head; + + /* Update the submission queue head. */ + if (cqe->sq_head != sq->head) { + sq->head = cqe->sq_head; + grub_dprintf("nvme", "sq %p advanced to %u\n", sq, cqe->sq_head); + } + + /* Tell the controller that we consumed the completion. */ + *(volatile grub_uint32_t *) cq->common.dbl = cq->head; + + return *cqe; +} + +static struct nvme_cqe +nvme_wait(struct nvme_sq *sq) +{ + // static const unsigned nvme_timeout = 5000 /* ms */; + // grub_uint32_t to = timer_calc(nvme_timeout); + while (!nvme_poll_cq(sq->cq)) { + /* FIXME + yield(); + + if (timer_check(to)) { + warn_timeout(); + return nvme_error_cqe(); + }*/ + } + + return nvme_consume_cqe(sq); +} + +/* Returns the next submission queue entry (or NULL if the queue is full). It + also fills out Command Dword 0 and clears the rest. */ +static struct nvme_sqe * +nvme_get_next_sqe(struct nvme_sq *sq, grub_uint8_t opc, void *metadata, void *data, void *data2) +{ + if (((sq->head + 1) & sq->common.mask) == sq->tail) { + grub_dprintf("nvme", "submission queue is full\n"); + return NULL; + } + + struct nvme_sqe *sqe = &sq->sqe[sq->tail]; + grub_dprintf("nvme", "sq %p next_sqe %u\n", sq, sq->tail); + + grub_memset(sqe, 0, sizeof(*sqe)); + sqe->cdw0 = opc | (sq->tail << 16 /* CID */); + sqe->mptr = (grub_uint32_t)metadata; + sqe->dptr_prp1 = (grub_uint32_t)data; + sqe->dptr_prp2 = (grub_uint32_t)data2; + + return sqe; +} + +/* Call this after you've filled out an sqe that you've got from nvme_get_next_sqe. */ +static void +nvme_commit_sqe(struct nvme_sq *sq) +{ + grub_dprintf("nvme", "sq %p commit_sqe %u\n", sq, sq->tail); + sq->tail = (sq->tail + 1) & sq->common.mask; + *(volatile grub_uint32_t *) sq->common.dbl = sq->tail; +} + +/* Perform an identify command on the admin queue and return the resulting + buffer. This may be a NULL pointer, if something failed. This function + cannot be used after initialization, because it uses buffers in tmp zone. */ +static union nvme_identify * +nvme_admin_identify(struct nvme_ctrl *ctrl, grub_uint8_t cns, grub_uint32_t nsid) +{ + union nvme_identify *identify_buf = zalloc_page_aligned(4096); + if (!identify_buf) + return NULL; + + struct nvme_sqe *cmd_identify; + cmd_identify = nvme_get_next_sqe(&ctrl->admin_sq, + NVME_SQE_OPC_ADMIN_IDENTIFY, NULL, + identify_buf, NULL); + if (!cmd_identify) + goto error; + + cmd_identify->nsid = nsid; + cmd_identify->dword[10] = cns; + + nvme_commit_sqe(&ctrl->admin_sq); + + struct nvme_cqe cqe = nvme_wait(&ctrl->admin_sq); + + if (!nvme_is_cqe_success(&cqe)) { + goto error; + } + + return identify_buf; + error: + grub_free(identify_buf); + return NULL; +} + +static struct nvme_identify_ctrl * +nvme_admin_identify_ctrl(struct nvme_ctrl *ctrl) +{ + return &nvme_admin_identify(ctrl, NVME_ADMIN_IDENTIFY_CNS_ID_CTRL, 0)->ctrl; +} + +static struct nvme_identify_ns * +nvme_admin_identify_ns(struct nvme_ctrl *ctrl, grub_uint32_t ns_id) +{ + return &nvme_admin_identify(ctrl, NVME_ADMIN_IDENTIFY_CNS_ID_NS, + ns_id)->ns; +} + +static void +nvme_probe_ns(struct nvme_ctrl *ctrl, grub_uint32_t ns_idx, grub_uint8_t mdts) +{ + grub_uint32_t ns_id = ns_idx + 1; + + struct nvme_identify_ns *id = nvme_admin_identify_ns(ctrl, ns_id); + if (!id) { + grub_dprintf("nvme", "NVMe couldn't identify namespace %u.\n", ns_id); + goto free_buffer; + } + + grub_uint8_t current_lba_format = id->flbas & 0xF; + if (current_lba_format > id->nlbaf) { + grub_dprintf("nvme", "NVMe NS %u: current LBA format %u is beyond what the " + " namespace supports (%u)?\n", + ns_id, current_lba_format, id->nlbaf + 1); + goto free_buffer; + } + + if (!id->nsze) { + grub_dprintf("nvme", "NVMe NS %u is inactive.\n", ns_id); + goto free_buffer; + } + + if (!nvme_dma_buffer) { + nvme_dma_buffer = zalloc_page_aligned(NVME_PAGE_SIZE); + if (!nvme_dma_buffer) { + goto free_buffer; + } + } + + struct nvme_namespace *ns = grub_malloc(sizeof(*ns)); + if (!ns) { + goto free_buffer; + } + grub_memset(ns, 0, sizeof(*ns)); + ns->ctrl = ctrl; + ns->ns_id = ns_id; + ns->lba_count = id->nsze; + + struct nvme_lba_format *fmt = &id->lbaf[current_lba_format]; + + ns->block_size = 1U << fmt->lbads; + ns->metadata_size = fmt->ms; + + if (ns->block_size > NVME_PAGE_SIZE) { + /* If we see devices that trigger this path, we need to increase our + buffer size. */ + grub_free(ns); + goto free_buffer; + } + + if (mdts) { + ns->max_req_size = ((1U << mdts) * NVME_PAGE_SIZE) / ns->block_size; + grub_dprintf("nvme", "NVME NS %u max request size: %d sectors\n", + ns_id, ns->max_req_size); + } else { + ns->max_req_size = -1U; + } + + ns->devname = grub_xasprintf("nvme%un%u", ctrl->ctrlnum, ns_id); + ns->nsnum = grub_nvme_nscnt++; + + grub_list_push (GRUB_AS_LIST_P (&grub_nvme_namespaces), GRUB_AS_LIST (ns)); + +free_buffer: + grub_free(id); +} + + +/* Release memory allocated for a completion queue */ +static void +nvme_destroy_cq(struct nvme_cq *cq) +{ + grub_free(cq->cqe); + cq->cqe = NULL; +} + +/* Release memory allocated for a submission queue */ +static void +nvme_destroy_sq(struct nvme_sq *sq) +{ + grub_free(sq->sqe); + sq->sqe = NULL; +} + +/* Returns 0 on success. */ +static int +nvme_create_io_cq(struct nvme_ctrl *ctrl, struct nvme_cq *cq, grub_uint16_t q_idx) +{ + int rc; + struct nvme_sqe *cmd_create_cq; + grub_uint32_t length = 1 + (ctrl->reg->cap & 0xffff); + if (length > NVME_PAGE_SIZE / sizeof(struct nvme_cqe)) + length = NVME_PAGE_SIZE / sizeof(struct nvme_cqe); + + rc = nvme_init_cq(ctrl, cq, q_idx, length); + if (rc) { + goto err; + } + + cmd_create_cq = nvme_get_next_sqe(&ctrl->admin_sq, + NVME_SQE_OPC_ADMIN_CREATE_IO_CQ, NULL, + cq->cqe, NULL); + if (!cmd_create_cq) { + goto err_destroy_cq; + } + + cmd_create_cq->dword[10] = (cq->common.mask << 16) | (q_idx >> 1); + cmd_create_cq->dword[11] = 1 /* physically contiguous */; + + nvme_commit_sqe(&ctrl->admin_sq); + + struct nvme_cqe cqe = nvme_wait(&ctrl->admin_sq); + + if (!nvme_is_cqe_success(&cqe)) { + grub_dprintf("nvme", "create io cq failed: %08x %08x %08x %08x\n", + cqe.dword[0], cqe.dword[1], cqe.dword[2], cqe.dword[3]); + + goto err_destroy_cq; + } + + return 0; + +err_destroy_cq: + nvme_destroy_cq(cq); +err: + return -1; +} + +/* Returns 0 on success. */ +static int +nvme_create_io_sq(struct nvme_ctrl *ctrl, struct nvme_sq *sq, grub_uint16_t q_idx, struct nvme_cq *cq) +{ + int rc; + struct nvme_sqe *cmd_create_sq; + grub_uint32_t length = 1 + (ctrl->reg->cap & 0xffff); + if (length > NVME_PAGE_SIZE / sizeof(struct nvme_cqe)) + length = NVME_PAGE_SIZE / sizeof(struct nvme_cqe); + + rc = nvme_init_sq(ctrl, sq, q_idx, length, cq); + if (rc) { + goto err; + } + + cmd_create_sq = nvme_get_next_sqe(&ctrl->admin_sq, + NVME_SQE_OPC_ADMIN_CREATE_IO_SQ, NULL, + sq->sqe, NULL); + if (!cmd_create_sq) { + goto err_destroy_sq; + } + + cmd_create_sq->dword[10] = (sq->common.mask << 16) | (q_idx >> 1); + cmd_create_sq->dword[11] = (q_idx >> 1) << 16 | 1 /* contiguous */; + grub_dprintf("nvme", "sq %p create dword10 %08x dword11 %08x\n", sq, + cmd_create_sq->dword[10], cmd_create_sq->dword[11]); + + nvme_commit_sqe(&ctrl->admin_sq); + + struct nvme_cqe cqe = nvme_wait(&ctrl->admin_sq); + + if (!nvme_is_cqe_success(&cqe)) { + grub_dprintf("nvme", "create io sq failed: %08x %08x %08x %08x\n", + cqe.dword[0], cqe.dword[1], cqe.dword[2], cqe.dword[3]); + goto err_destroy_sq; + } + + return 0; + +err_destroy_sq: + nvme_destroy_sq(sq); +err: + return -1; +} + +/* Reads count sectors into buf. The buffer cannot cross page boundaries. */ +static int +nvme_io_xfer(struct nvme_namespace *ns, grub_uint64_t lba, void *prp1, void *prp2, + grub_uint16_t count, int write) +{ + if (((grub_uint32_t)prp1 & 0x3) || ((grub_uint32_t)prp2 & 0x3)) { + /* Buffer is misaligned */ + return -1; + } + + struct nvme_sqe *io_read = nvme_get_next_sqe(&ns->ctrl->io_sq, + write ? NVME_SQE_OPC_IO_WRITE + : NVME_SQE_OPC_IO_READ, + NULL, prp1, prp2); + io_read->nsid = ns->ns_id; + io_read->dword[10] = (grub_uint32_t)lba; + io_read->dword[11] = (grub_uint32_t)(lba >> 32); + io_read->dword[12] = (1U << 31 /* limited retry */) | (count - 1); + + nvme_commit_sqe(&ns->ctrl->io_sq); + + struct nvme_cqe cqe = nvme_wait(&ns->ctrl->io_sq); + + if (!nvme_is_cqe_success(&cqe)) { + grub_dprintf("nvme", "read io: %08x %08x %08x %08x\n", + cqe.dword[0], cqe.dword[1], cqe.dword[2], cqe.dword[3]); + + return -1; + } + + grub_dprintf("nvme", "ns %u %s lba %llu+%u\n", ns->ns_id, write ? "write" : "read", + lba, count); + return count; +} + +// Transfer up to one page of data using the internal dma bounce buffer +static int +nvme_bounce_xfer(struct nvme_namespace *ns, grub_uint64_t lba, void *buf, grub_uint16_t count, + int write) +{ + grub_uint16_t const max_blocks = NVME_PAGE_SIZE / ns->block_size; + grub_uint16_t blocks = count < max_blocks ? count : max_blocks; + + if (write) + grub_memcpy(nvme_dma_buffer, buf, blocks * ns->block_size); + + int res = nvme_io_xfer(ns, lba, nvme_dma_buffer, NULL, blocks, write); + + if (!write && res >= 0) + grub_memcpy(buf, nvme_dma_buffer, res * ns->block_size); + + return res; +} + +#define NVME_MAX_PRPL_ENTRIES 15 /* Allows requests up to 64kb */ + +// Transfer data using page list (if applicable) +static int +nvme_prpl_xfer(struct nvme_namespace *ns, grub_uint64_t lba, void *buf, grub_uint16_t count, + int write) +{ + grub_uint32_t base = (long)buf; + grub_int32_t size; + + if (count > ns->max_req_size) + count = ns->max_req_size; + + size = count * ns->block_size; + /* Special case for transfers that fit into PRP1, but are unaligned */ + if (((size + (base & ~NVME_PAGE_MASK)) <= NVME_PAGE_SIZE)) + goto single; + + /* Every request has to be page aligned */ + if (base & ~NVME_PAGE_MASK) + goto bounce; + + /* Make sure a full block fits into the last chunk */ + if (size & (ns->block_size - 1ULL)) + goto bounce; + + /* Build PRP list if we need to describe more than 2 pages */ + if ((ns->block_size * count) > (NVME_PAGE_SIZE * 2)) { + grub_uint32_t prpl_len = 0; + grub_uint64_t *prpl = nvme_dma_buffer; + int first_page = 1; + for (; size > 0; base += NVME_PAGE_SIZE, size -= NVME_PAGE_SIZE) { + if (first_page) { + /* First page is special */ + first_page = 0; + continue; + } + if (prpl_len >= NVME_MAX_PRPL_ENTRIES) + goto bounce; + prpl[prpl_len++] = base; + } + return nvme_io_xfer(ns, lba, buf, prpl, count, write); + } + + /* Directly embed the 2nd page if we only need 2 pages */ + if ((ns->block_size * count) > NVME_PAGE_SIZE) + return nvme_io_xfer(ns, lba, buf, (char *) buf + NVME_PAGE_SIZE, count, write); + +single: + /* One page is enough, don't expose anything else */ + return nvme_io_xfer(ns, lba, buf, NULL, count, write); + +bounce: + /* Use bounce buffer to make transfer */ + return nvme_bounce_xfer(ns, lba, buf, count, write); +} + +static int +nvme_create_io_queues(struct nvme_ctrl *ctrl) +{ + if (nvme_create_io_cq(ctrl, &ctrl->io_cq, 3)) + goto err; + + if (nvme_create_io_sq(ctrl, &ctrl->io_sq, 2, &ctrl->io_cq)) + goto err_free_cq; + + return 0; + + err_free_cq: + nvme_destroy_cq(&ctrl->io_cq); + err: + return -1; +} + +/* Waits for CSTS.RDY to match rdy. Returns 0 on success. */ +static int +nvme_wait_csts_rdy(struct nvme_ctrl *ctrl, unsigned rdy) +{ + // grub_uint32_t const max_to = 500 /* ms */ * ((ctrl->reg->cap >> 24) & 0xFFU); + // grub_uint32_t to = timer_calc(max_to); + grub_uint32_t csts; + + while (rdy != ((csts = ctrl->reg->csts) & NVME_CSTS_RDY)) { + // FIXME + //yield(); + + if (csts & NVME_CSTS_FATAL) { + grub_dprintf("nvme", "NVMe fatal error during controller shutdown\n"); + return -1; + } + + /* + if (timer_check(to)) { + warn_timeout(); + return -1; + }*/ + } + + return 0; +} + +/* Returns 0 on success. */ +static int grub_nvme_controller_enable(struct nvme_ctrl *ctrl) +{ + grub_pci_address_t addr; + int rc; + + addr = grub_pci_make_address (ctrl->pci, GRUB_PCI_REG_COMMAND); + grub_pci_write_word (addr, grub_pci_read_word (addr) | GRUB_PCI_COMMAND_BUS_MASTER); + + /* Turn the controller off. */ + ctrl->reg->cc = 0; + if (nvme_wait_csts_rdy(ctrl, 0)) { + grub_dprintf("nvme", "NVMe fatal error during controller shutdown\n"); + return -1; + } + + ctrl->doorbell_stride = 4U << ((ctrl->reg->cap >> 32) & 0xF); + + rc = nvme_init_cq(ctrl, &ctrl->admin_cq, 1, + NVME_PAGE_SIZE / sizeof(struct nvme_cqe)); + if (rc) { + return -1; + } + + rc = nvme_init_sq(ctrl, &ctrl->admin_sq, 0, + NVME_PAGE_SIZE / sizeof(struct nvme_sqe), &ctrl->admin_cq); + if (rc) { + goto err_destroy_admin_cq; + } + + ctrl->reg->aqa = ctrl->admin_cq.common.mask << 16 + | ctrl->admin_sq.common.mask; + + ctrl->reg->asq = (grub_uint32_t)ctrl->admin_sq.sqe; + ctrl->reg->acq = (grub_uint32_t)ctrl->admin_cq.cqe; + + grub_dprintf("nvme", " admin submission queue: %p\n", ctrl->admin_sq.sqe); + grub_dprintf("nvme", " admin completion queue: %p\n", ctrl->admin_cq.cqe); + + ctrl->reg->cc = NVME_CC_EN | (NVME_CQE_SIZE_LOG << 20) + | (NVME_SQE_SIZE_LOG << 16 /* IOSQES */); + + if (nvme_wait_csts_rdy(ctrl, 1)) { + grub_dprintf("nvme", "NVMe fatal error while enabling controller\n"); + goto err_destroy_admin_sq; + } + + /* The admin queue is set up and the controller is ready. Let's figure out + what namespaces we have. */ + + struct nvme_identify_ctrl *identify = nvme_admin_identify_ctrl(ctrl); + + if (!identify) { + grub_dprintf("nvme", "NVMe couldn't identify controller.\n"); + goto err_destroy_admin_sq; + } + + grub_dprintf("nvme", "NVMe has %u namespace%s.\n", + identify->nn, (identify->nn == 1) ? "" : "s"); + + ctrl->ns_count = identify->nn; + grub_uint8_t mdts = identify->mdts; + grub_free(identify); + + if ((ctrl->ns_count == 0) || nvme_create_io_queues(ctrl)) { + /* No point to continue, if the controller says it doesn't have + namespaces or we couldn't create I/O queues. */ + goto err_destroy_admin_sq; + } + + /* Give the controller a global number */ + ctrl->ctrlnum = grub_nvme_ctrlcnt++; + + /* Populate namespace IDs */ + for (grub_uint32_t ns_idx = 0; ns_idx < ctrl->ns_count; ns_idx++) { + nvme_probe_ns(ctrl, ns_idx, mdts); + } + + grub_dprintf("nvme", "NVMe initialization complete!\n"); + return 0; + + err_destroy_admin_sq: + nvme_destroy_sq(&ctrl->admin_sq); + err_destroy_admin_cq: + nvme_destroy_cq(&ctrl->admin_cq); + return -1; +} + +static int grub_nvme_pci_probe(grub_pci_device_t dev, grub_pci_id_t pciid __attribute__ ((unused)), void *data __attribute__ ((unused))) +{ + grub_pci_address_t addr; + grub_uint32_t class, bar, version; + struct nvme_reg volatile *reg; + + class = grub_pci_read (grub_pci_make_address (dev, GRUB_PCI_REG_CLASS)); + if (class >> 16 != 0x0108) + return 0; + if ((class >> 8 & 0xff) != 2) { /* as of NVM 1.0e */ + grub_dprintf("nvme", "Found incompatble NVMe: prog-if=%02x\n", class >> 8 & 0xff); + return 0; + } + + bar = grub_pci_read (grub_pci_make_address (dev, GRUB_PCI_REG_ADDRESS_REG0)); + reg = grub_pci_device_map_range (dev, bar & GRUB_PCI_ADDR_MEM_MASK, sizeof (*reg)); + + addr = grub_pci_make_address (dev, GRUB_PCI_REG_COMMAND); + grub_pci_write_word (addr, grub_pci_read_word (addr) | GRUB_PCI_COMMAND_MEM_ENABLED); + + version = reg->vs; + grub_dprintf("nvme", "Found NVMe controller with version %u.%u.%u.\n", version >> 16, (version >> 8) & 0xFF, version & 0xFF); + grub_dprintf("nvme", " Capabilities %016llx\n", reg->cap); + + if (~reg->cap & NVME_CAP_CSS_NVME) { + grub_dprintf("nvme", "Controller doesn't speak NVMe command set. Skipping.\n"); + goto err; + } + + struct nvme_ctrl *ctrl = grub_malloc(sizeof(*ctrl)); + if (!ctrl) + goto err; + + grub_memset(ctrl, 0, sizeof(*ctrl)); + + ctrl->reg = reg; + ctrl->pci = dev; + + if (grub_nvme_controller_enable(ctrl)) + goto err_free_ctrl; + + return 0; + + err_free_ctrl: + grub_free(ctrl); + err: + grub_dprintf("nvme", "Failed to enable NVMe controller.\n"); + return 0; +} + +static int +grub_nvme_iterate (grub_disk_dev_iterate_hook_t hook, void *hook_data, grub_disk_pull_t pull) +{ + struct nvme_namespace *ns; + + if (pull != GRUB_DISK_PULL_NONE) + return 0; + + FOR_LIST_ELEMENTS(ns, grub_nvme_namespaces) + if (hook (ns->devname, hook_data)) + return 1; + + return 0; +} + +static grub_err_t +grub_nvme_open (const char *name __attribute ((unused)), grub_disk_t disk __attribute ((unused))) +{ + struct nvme_namespace *ns; + + FOR_LIST_ELEMENTS(ns, grub_nvme_namespaces) + if (grub_strcmp (ns->devname, name) == 0) + break; + + if (! ns) + return grub_error (GRUB_ERR_UNKNOWN_DEVICE, "can't open device"); + + disk->total_sectors = ns->lba_count; + disk->max_agglomerate = ns->max_req_size; + + disk->id = ns->nsnum; /* global id of the namespace */ + + disk->data = ns; + + return 0; +} + +static grub_err_t +nvme_readwrite(struct nvme_namespace *ns, grub_disk_addr_t sector, grub_size_t num_sectors, char *buf, int write) +{ + for (int i = 0; i < num_sectors;) { + grub_uint16_t blocks_remaining = num_sectors - i; + char *op_buf = buf + i * ns->block_size; + int blocks = nvme_prpl_xfer(ns, sector + i, op_buf, blocks_remaining, write); + if (blocks < 0) + return GRUB_ERR_IO; + i += blocks; + } + return GRUB_ERR_NONE; +} + +static grub_err_t +grub_nvme_read (grub_disk_t disk, grub_disk_addr_t sector, grub_size_t num_sectors, char *buf) +{ + return nvme_readwrite((struct nvme_namespace *) disk->data, sector, num_sectors, buf, 0); +} + +static grub_err_t +grub_nvme_write (grub_disk_t disk, grub_disk_addr_t sector, grub_size_t num_sectors, const char *buf) +{ + return nvme_readwrite((struct nvme_namespace *) disk->data, sector, num_sectors, buf, 1); +} + +static struct grub_disk_dev grub_nvme_dev = + { + .name = "nvme", + .id = GRUB_DISK_DEVICE_NVME_ID, + .disk_iterate = grub_nvme_iterate, + .disk_open = grub_nvme_open, + .disk_read = grub_nvme_read, + .disk_write = grub_nvme_write, + .next = 0 + }; + +GRUB_MOD_INIT(nvme) +{ + grub_stop_disk_firmware (); + grub_pci_iterate (grub_nvme_pci_probe, NULL); + grub_disk_dev_register (&grub_nvme_dev); +} + +GRUB_MOD_FINI(nvme) +{ + grub_disk_dev_unregister (&grub_nvme_dev); +} diff --git a/include/grub/disk.h b/include/grub/disk.h index fbf23df7f..186e76f0b 100644 --- a/include/grub/disk.h +++ b/include/grub/disk.h @@ -52,6 +52,7 @@ enum grub_disk_dev_id GRUB_DISK_DEVICE_UBOOTDISK_ID, GRUB_DISK_DEVICE_XEN, GRUB_DISK_DEVICE_OBDISK_ID, + GRUB_DISK_DEVICE_NVME_ID }; struct grub_disk; -- 2.39.2