Hi,我们这边在使用AGX Orin 64G(emmc型号DG4064)版本时出现了如下mmc故障报错,目前发现 在启动阶段出现的概率很高,在正常工作过程中也会小概率出现,Debug口输出的故障信息如下:

[57657.004920] WARNING: CPU: 8 PID: 123 at drivers/mmc/host/cqhci.c:1071 cqhci_0
[57657.014757] ---[ end trace b8b81fbd4983b9b2 ]---                             
[57657.019560] blk_update_request: I/O error, dev mmcblk0, sector 3050048 op 0x0
[57657.031078] Buffer I/O error on dev mmcblk0p1, logical block 0, lost sync pae
[57657.039150] EXT4-fs (mmcblk0p1): I/O error while writing superblock          
[57657.039213] mmc0: cqhci: CQE failed to exit halt state                       
[57657.039369] EXT4-fs (mmcblk0p1): previous I/O error to superblock detected   
[57657.045713] EXT4-fs error (device mmcblk0p1): ext4_dirty_inode:6002: inode #r
[57718.216890] mmc0: cqhci: timeout for tag 17                                  
[57718.221206] mmc0: cqhci: ============ CQHCI REGISTER DUMP ===========        
[57718.227841] mmc0: cqhci: Caps:      0x000020c8 | Version:  0x00000510        
[57718.234483] mmc0: cqhci: Config:    0x00000001 | Control:  0x00000100        
[57718.241110] mmc0: cqhci: Int stat:  0x00000000 | Int enab: 0x00000006        
[57718.247750] mmc0: cqhci: Int sig:   0x00000006 | Int Coal: 0x00000000        
[57718.254379] mmc0: cqhci: TDL base:  0x00000000 | TDL up32: 0x00000000        
[57718.261006] mmc0: cqhci: Doorbell:  0x001e0000 | TCN:      0x00000000        
[57718.267643] mmc0: cqhci: Dev queue: 0x00000000 | Dev Pend: 0x00000000        
[57718.274265] mmc0: cqhci: Task clr:  0x00000000 | SSC1:     0x00001000        
[57718.280903] mmc0: cqhci: SSC2:      0x00000000 | DCMD rsp: 0x00000800        
[57718.287529] mmc0: cqhci: RED mask:  0xfdf9a080 | TERRI:    0x002f000d        
[57718.294170] mmc0: cqhci: Resp idx:  0x0000002e | Resp arg: 0x00000900        
[57718.300811] mmc0: sdhci: ============ SDHCI REGISTER DUMP ===========        
[57718.307451] mmc0: sdhci: Sys addr:  0x00000000 | Version:  0x00000505        
[57718.314092] mmc0: sdhci: Blk size:  0x00007200 | Blk cnt:  0x00000000        
[57718.320731] mmc0: sdhci: Argument:  0x00010000 | Trn mode: 0x00000023        
[57718.327355] mmc0: sdhci: Present:   0x01fb00f0 | Host ctl: 0x00000038        
[57718.333993] mmc0: sdhci: Power:     0x00000001 | Blk gap:  0x00000000        
[57718.340635] mmc0: sdhci: Wake-up:   0x00000000 | Clock:    0x0000000f        
[57718.347257] mmc0: sdhci: Timeout:   0x0000000e | Int stat: 0x00000000        
[57718.353894] mmc0: sdhci: Int enab:  0x00ff0003 | Sig enab: 0x00fc0003        
[57718.360530] mmc0: sdhci: ACmd stat: 0x00000000 | Slot int: 0x00000000        
[57718.367169] mmc0: sdhci: Caps:      0x3f6cd08c | Caps_1:   0x18002f73        
[57718.373815] mmc0: sdhci: Cmd:       0x00000d1a | Max curr: 0x00000000        
[57718.380454] mmc0: sdhci: Resp[0]:   0x00000900 | Resp[1]:  0x0400658d        
[57718.387077] mmc0: sdhci: Resp[2]:   0x0fc30000 | Resp[3]:  0x00000240        
[57718.393718] mmc0: sdhci: Host ctl2: 0x0000300d                               
[57718.398297] mmc0: sdhci: ADMA Err:  0x00000000 | ADMA Ptr: 0x0000007fffffe210
[57718.405674] mmc0: sdhci: ============================================        
[57718.412313] mmc0: running CQE recovery                                       
[57718.428886] CPU:0, Error: cbb-fabric@0x13a00000, irq=32               

之前Nvidia反馈建议使用Jetpack 5.1.4进行验证;目前我们发现,使用Jetpack 5.1.2和Jetpack 5.1.4都会出现这个问题。
但是之前使用G1M15M这个型号的emmc的Orin模组是没有这个问题的

另外有一个信息是,使用fsck检查mmcblk0p1会有如下报警信息:麻烦看是否会对emmc工作有影响
fsck.log (1.5 KB)

上次出现问题的topic: “mmc0: cache flush error -110” during Orin work

详细的mmc故障日志:
mmc_fault.log (30.4 KB)

Hi,

Could you put the module into an NV devkit and reproduce this with JetPack 6.1?

Hi 您好,

我们把一颗Orin模组升级到了6.1,在我们自己的硬件环境上进行复现,目前还未出现该问题;同时,我们自己的5.1.2固件版本也在Nvidia开发板上做过重启验证,同样复现了这个问题。

不过,我们查看了DG4064这个型号eMMC的PCN(210100)变更说明,Jetpack 5.1.2应该是做过适配的,如下图:

请问下,是否有哪些烧录环境的配置参数可能会导致这个问题吗,我们是否可以检查或者修改哪些烧录环境的参数来定位这个问题呢?

There was an old known issue for SD card, but I think it could happen to eMMC as well, since SDIO and SDMMC share the same driver.

您好,升级到6.1的变动有点大,我们需要一些时间进行适配。
请问下,此类问题是否还有其他的方式进行修复,比如5.1.2上是否有针对这个问题的补丁呢?可以作为一个过渡方案

for rel-35.6, please try these two patches.

patch 1:

diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index 421c670..350d706 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -242,13 +242,15 @@
 				   sizeof(*ptep) * num_entries, DMA_TO_DEVICE);
 }
 
-static void __arm_lpae_clear_pte(arm_lpae_iopte *ptep, struct io_pgtable_cfg *cfg)
+static void __arm_lpae_clear_pte(arm_lpae_iopte *ptep, struct io_pgtable_cfg *cfg, int num_entries)
 {
+	int i;
 
-	*ptep = 0;
+	for (i = 0; i < num_entries; i++)
+		ptep[i] = 0;
 
-	if (!cfg->coherent_walk)
-		__arm_lpae_sync_pte(ptep, 1, cfg);
+	if (!cfg->coherent_walk && num_entries)
+		__arm_lpae_sync_pte(ptep, num_entries, cfg);
 }
 
 static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
@@ -610,7 +612,7 @@
 {
 	arm_lpae_iopte pte;
 	struct io_pgtable *iop = &data->iop;
-	int i = 0, num_entries, max_entries, unmap_idx_start;
+	int i = 0, j = 0, num_entries, max_entries, unmap_idx_start;
 
 	/* Something went horribly wrong and we ran out of page table */
 	if (WARN_ON(lvl == ARM_LPAE_MAX_LEVELS))
@@ -627,33 +629,28 @@
 		max_entries = ARM_LPAE_PTES_PER_TABLE(data) - unmap_idx_start;
 		num_entries = min_t(int, pgcount, max_entries);
 
-		while (i < num_entries) {
-			pte = READ_ONCE(*ptep);
+		/* Find and handle non-leaf entries */
+		for (i = 0; i < num_entries; i++) {
+			pte = READ_ONCE(ptep[i]);
 			if (WARN_ON(!pte))
 				break;
 
-			__arm_lpae_clear_pte(ptep, &iop->cfg);
-
 			if (!iopte_leaf(pte, lvl, iop->fmt)) {
+				__arm_lpae_clear_pte(&ptep[i], &iop->cfg, 1);
+
 				/* Also flush any partial walks */
 				io_pgtable_tlb_flush_walk(iop, iova + i * size, size,
 							  ARM_LPAE_GRANULE(data));
 				__arm_lpae_free_pgtable(data, lvl + 1, iopte_deref(pte, data));
-			} else if (iop->cfg.quirks & IO_PGTABLE_QUIRK_NON_STRICT) {
-				/*
-				 * Order the PTE update against queueing the IOVA, to
-				 * guarantee that a flush callback from a different CPU
-				 * has observed it before the TLBIALL can be issued.
-				 */
-				smp_wmb();
-			} else {
-				io_pgtable_tlb_add_page(iop, gather, iova + i * size, size);
 			}
-
-			ptep++;
-			i++;
 		}
 
+		/* Clear the remaining entries */
+		__arm_lpae_clear_pte(ptep, &iop->cfg, i);
+
+		for (j = 0; j < i; j++)
+			io_pgtable_tlb_add_page(iop, gather, iova + j * size, size);
+
 		return i * size;
 	} else if (iopte_leaf(pte, lvl, iop->fmt)) {
 		/*

patch2:

diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index 497c703..421c670 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -242,15 +242,13 @@
 				   sizeof(*ptep) * num_entries, DMA_TO_DEVICE);
 }
 
-static void __arm_lpae_clear_pte(arm_lpae_iopte *ptep, struct io_pgtable_cfg *cfg, int num_entries)
+static void __arm_lpae_clear_pte(arm_lpae_iopte *ptep, struct io_pgtable_cfg *cfg)
 {
-	int i;
 
-	for (i = 0; i < num_entries; i++)
-		ptep[i] = 0;
+	*ptep = 0;
 
 	if (!cfg->coherent_walk)
-		__arm_lpae_sync_pte(ptep, num_entries, cfg);
+		__arm_lpae_sync_pte(ptep, 1, cfg);
 }
 
 static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
@@ -629,15 +627,13 @@
 		max_entries = ARM_LPAE_PTES_PER_TABLE(data) - unmap_idx_start;
 		num_entries = min_t(int, pgcount, max_entries);
 
-		pte = READ_ONCE(*ptep);
-		if (WARN_ON(!pte))
-			return 0;
-
-		__arm_lpae_clear_pte(ptep, &iop->cfg, num_entries);
 		while (i < num_entries) {
+			pte = READ_ONCE(*ptep);
 			if (WARN_ON(!pte))
 				break;
 
+			__arm_lpae_clear_pte(ptep, &iop->cfg);
+
 			if (!iopte_leaf(pte, lvl, iop->fmt)) {
 				/* Also flush any partial walks */
 				io_pgtable_tlb_flush_walk(iop, iova + i * size, size,

您好,请问下,是否有针对35.4.1的patch呢?
目前我们批量使用的是 Jetson Linux 35.4.1的版本。

您好,我们在35.6上打了这两个patch,使用nvbuild.sh编译了新的内核文件,并替换到升级了5.1.4的公版固件的Orin模组内,进行reboot压力测试,目前依然还是复现了mmc的问题.
但是我们使用6.1的公版固件进行压力测试,目前已经进行了约1周的时间,没有复现。看样子6.1上应该是修复了这个问题,请问是否还有其他的patch可以修复这个问题?