
Merge branch 'topic/4.19/dra7-late-attach' of git://git.ti.com/rpmsg/remoteproc into rpmsg-ti-linux-4.19.y

Pull in a dedicated remoteproc topic branch that adds the infrastructure
support to allow "late attach" of the IPU1 and IPU2 remote processors on
DRA7xx/AM57xx SoCs. The late-attach feature is the kernel counterpart to the
early boot of these processors from U-Boot/SPL, and is used to achieve key
system response-time goals.

The support is provided through temporary HACKs or workarounds to various
subsystems, including the DMA API layers, the remoteproc core, and the OMAP
IOMMU, OMAP DMTimer and OMAP remoteproc drivers. The early-booted remoteproc
devices need a couple of DT modifications, which are provided through a
common dtsi file. This common dtsi file needs to be included in the desired
target DRA7xx/AM57xx board dts files (the board dts changes are not merged
here, but are available on the above topic branch).
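
For reference, pulling in the common dtsi is a one-line include in the board
dts (the board file named below is only an illustrative target; the actual
board patches live on the topic branch):

    /* e.g. in a DRA7xx/AM57xx board dts such as am57xx-beagle-x15.dts */
    #include "dra7-ipu-common-early-boot.dtsi"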

The following are the main HACKs that need further rework:
 - Introduce a new non-zeroing DMA API to allow the remoteproc core to
   still process the RSC_CARVEOUT entries without zeroing them (this will
   need to be replaced with static carveout support); see the sketch after
   this list.
 - The new "late_attach" field added to the remoteproc core needs to be
   replaced with the equivalent fields already present that support the
   K3 IPC-only mode and the Keystone 2 Multi Proc Manager (MPM) stack.
 - The .device_is_enabled() logic is only temporary and applicable only
   with the hwmod layers; it will need to be replaced with logic based on
   the reset status from the OMAP PRM reset driver (next LTS).
 - The reserved memory regions used should actually be carveouts (DMA
   pools), and this requires splitting some resource table RSC_CARVEOUT
   entries in all the existing firmwares.
 - The addition of the DT modifications needs to be evaluated to see if it
   can be moved into U-Boot itself. The current logic requires the U-Boot
   and kernel configurations to match; mismatches result in kernel crashes.
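
To illustrate the first item, this is how the late-attach path is selected
and how the non-zeroing allocation is used (a condensed sketch of the
omap_remoteproc and remoteproc core changes below; error handling trimmed):

    /* omap_rproc_probe(): detect a core already booted by U-Boot/SPL */
    if (pdata->device_is_enabled && pdata->device_is_enabled(pdev)) {
        rproc->late_attach = 1;
        set_dma_ops(&pdev->dev, &arm_dma_m_ops);  /* non-zeroing .alloc */
    }

    /* rproc_handle_carveout(): allocate without scrubbing the memory */
    if (rproc->late_attach)
        va = dma_malloc_coherent(dev->parent, rsc->len, &dma, GFP_KERNEL);
    else
        va = dma_alloc_coherent(dev->parent, rsc->len, &dma, GFP_KERNEL);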

* 'topic/4.19/dra7-late-attach' of git://git.ti.com/rpmsg/remoteproc:
  HACK: ARM: dts: dra7-ipu-common: Revert to CMA pools for IPU early boots
  TEMP: ARM: dts: dra7-ipu-common: Add a common file for IPU early boots
  TEMP: remoteproc/omap: add "late attach" support
  TEMP: iommu/omap: add support for performing a "late attach"
  ARM: OMAP2+: pdata-quirks: Add device_is_enabled ops for IOMMUs and rprocs
  clocksource: timer-ti-dm: Add support to handle late attach of rproc timers
  TEMP: remoteproc: call the non-zeroing allocation function
  TEMP: remoteproc: add "late attach" support
  HACK: dma-mapping: add non-zeroing coherent alloc function
  HACK: ARM: dma-mapping: create non-zeroing dma_map_ops

Signed-off-by: Suman Anna <s-anna@ti.com>

+ 70 - 0
arch/arm/boot/dts/dra7-ipu-common-early-boot.dtsi

@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2019 Texas Instruments Incorporated - http://www.ti.com/
+ *
+ * Common dtsi file that needs to be included in the corresponding TI DRA7xx
+ * and AM57xx board dts files that have both the IPU1 and IPU2 remote
+ * processors booted early from TI U-Boot/SPL.
+ */
+
+/ {
+	reserved-memory {
+		mmu-early-page-tables@95700000 {
+			/* address needs to match the usage within U-Boot */
+			reg = <0x0 0x95700000 0x0 0x100000>;
+			no-map;
+		};
+	};
+};
+
+/* IPU2 */
+&timer3 {
+	ti,no-idle-on-init;
+	ti,no-reset-on-init;
+};
+
+&timer4 {
+	ti,no-idle-on-init;
+	ti,no-reset-on-init;
+};
+
+&timer9 {
+	ti,no-idle-on-init;
+	ti,no-reset-on-init;
+};
+
+&mmu_ipu2 {
+	ti,no-idle-on-init;
+	ti,no-reset-on-init;
+};
+
+&ipu2 {
+	ti,no-idle-on-init;
+	ti,no-reset-on-init;
+};
+
+/* IPU1 */
+&timer11 {
+	ti,no-idle-on-init;
+	ti,no-reset-on-init;
+};
+
+&timer7 {
+	ti,no-idle-on-init;
+	ti,no-reset-on-init;
+};
+
+&timer8 {
+	ti,no-idle-on-init;
+	ti,no-reset-on-init;
+};
+
+&mmu_ipu1 {
+	ti,no-idle-on-init;
+	ti,no-reset-on-init;
+};
+
+&ipu1 {
+	ti,no-idle-on-init;
+	ti,no-reset-on-init;
+};

+ 1 - 0
arch/arm/include/asm/dma-mapping.h

@@ -14,6 +14,7 @@
 #include <asm/xen/hypervisor.h>
 
 extern const struct dma_map_ops arm_dma_ops;
+extern const struct dma_map_ops arm_dma_m_ops;
 extern const struct dma_map_ops arm_coherent_dma_ops;
 
 static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)

+ 14 - 0
arch/arm/mach-omap2/pdata-quirks.c

@@ -49,7 +49,18 @@ struct pdata_init {
 static struct of_dev_auxdata omap_auxdata_lookup[];
 static struct twl4030_gpio_platform_data twl_gpio_auxdata;
 
+static bool __maybe_unused omap_device_is_enabled(struct platform_device *pdev)
+{
+	struct omap_device *od = to_omap_device(pdev);
+
+	if (od->_state == OMAP_DEVICE_STATE_ENABLED)
+		return true;
+	else
+		return false;
+}
+
 #if IS_ENABLED(CONFIG_OMAP_IOMMU)
+
 int omap_iommu_set_pwrdm_constraint(struct platform_device *pdev, bool request,
 				    u8 *pwrst);
 #else
@@ -436,6 +447,7 @@ static void __init omap3_pandora_legacy_init(void)
 static struct omap_rproc_pdata omap4_ipu_dsp_pdata = {
 	.device_enable = omap_rproc_device_enable,
 	.device_shutdown = omap_rproc_device_shutdown,
+	.device_is_enabled = omap_device_is_enabled,
 };
 #endif
 
@@ -447,6 +459,7 @@ static struct iommu_platform_data omap4_iommu_pdata = {
 	.deassert_reset = omap_device_deassert_hardreset,
 	.device_enable = omap_device_enable,
 	.device_idle = omap_device_idle,
+	.device_is_enabled = omap_device_is_enabled,
 };
 #endif
 
@@ -476,6 +489,7 @@ static struct iommu_platform_data dra7_ipu1_dsp_iommu_pdata = {
 	.assert_reset = omap_device_assert_hardreset,
 	.deassert_reset = omap_device_deassert_hardreset,
 	.device_enable = omap_device_enable,
+	.device_is_enabled = omap_device_is_enabled,
 	.device_idle = omap_device_idle,
 	.set_pwrdm_constraint = omap_iommu_set_pwrdm_constraint,
 };

+ 45 - 8
arch/arm/mm/dma-mapping.c

@@ -50,6 +50,7 @@ struct arm_dma_alloc_args {
 	const void *caller;
 	bool want_vaddr;
 	int coherent_flag;
+	bool zero;
 };
 
 struct arm_dma_free_args {
@@ -203,6 +204,27 @@ const struct dma_map_ops arm_dma_ops = {
 };
 EXPORT_SYMBOL(arm_dma_ops);
 
+static void *arm_dma_malloc(struct device *dev, size_t size, dma_addr_t *handle,
+			    gfp_t gfp, unsigned long dma_attrs);
+
+const struct dma_map_ops arm_dma_m_ops = {
+	.alloc                  = arm_dma_malloc,
+	.free                   = arm_dma_free,
+	.mmap                   = arm_dma_mmap,
+	.get_sgtable            = arm_dma_get_sgtable,
+	.map_page               = arm_dma_map_page,
+	.unmap_page             = arm_dma_unmap_page,
+	.map_sg                 = arm_dma_map_sg,
+	.unmap_sg               = arm_dma_unmap_sg,
+	.sync_single_for_cpu    = arm_dma_sync_single_for_cpu,
+	.sync_single_for_device = arm_dma_sync_single_for_device,
+	.sync_sg_for_cpu        = arm_dma_sync_sg_for_cpu,
+	.sync_sg_for_device     = arm_dma_sync_sg_for_device,
+	.mapping_error		= arm_dma_mapping_error,
+	.dma_supported		= arm_dma_supported,
+};
+EXPORT_SYMBOL(arm_dma_m_ops);
+
 static void *arm_coherent_dma_alloc(struct device *dev, size_t size,
 	dma_addr_t *handle, gfp_t gfp, unsigned long attrs);
 static void arm_coherent_dma_free(struct device *dev, size_t size, void *cpu_addr,
@@ -356,7 +378,7 @@ static void __dma_free_buffer(struct page *page, size_t size)
 static void *__alloc_from_contiguous(struct device *dev, size_t size,
 				     pgprot_t prot, struct page **ret_page,
 				     const void *caller, bool want_vaddr,
-				     int coherent_flag, gfp_t gfp);
+				     int coherent_flag, gfp_t gfp, bool zero);
 
 static void *__alloc_remap_buffer(struct device *dev, size_t size, gfp_t gfp,
 				 pgprot_t prot, struct page **ret_page,
@@ -413,7 +435,7 @@ static int __init atomic_pool_init(void)
 	if (dev_get_cma_area(NULL))
 		ptr = __alloc_from_contiguous(NULL, atomic_pool_size, prot,
 				      &page, atomic_pool_init, true, NORMAL,
-				      GFP_KERNEL);
+				      GFP_KERNEL, true);
 	else
 		ptr = __alloc_remap_buffer(NULL, atomic_pool_size, gfp, prot,
 					   &page, atomic_pool_init, true);
@@ -587,7 +609,7 @@ static int __free_from_pool(void *start, size_t size)
 static void *__alloc_from_contiguous(struct device *dev, size_t size,
 				     pgprot_t prot, struct page **ret_page,
 				     const void *caller, bool want_vaddr,
-				     int coherent_flag, gfp_t gfp)
+				     int coherent_flag, gfp_t gfp, bool zero)
 {
 	unsigned long order = get_order(size);
 	size_t count = size >> PAGE_SHIFT;
@@ -598,7 +620,8 @@ static void *__alloc_from_contiguous(struct device *dev, size_t size,
 	if (!page)
 		return NULL;
 
-	__dma_clear_buffer(page, size, coherent_flag);
+	if (zero)
+		__dma_clear_buffer(page, size, coherent_flag);
 
 	if (!want_vaddr)
 		goto out;
@@ -675,7 +698,7 @@ static void *cma_allocator_alloc(struct arm_dma_alloc_args *args,
 	return __alloc_from_contiguous(args->dev, args->size, args->prot,
 				       ret_page, args->caller,
 				       args->want_vaddr, args->coherent_flag,
-				       args->gfp);
+				       args->gfp, args->zero);
 }
 
 static void cma_allocator_free(struct arm_dma_free_args *args)
@@ -728,7 +751,7 @@ static struct arm_dma_allocator remap_allocator = {
 
 static void *__dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
 			 gfp_t gfp, pgprot_t prot, bool is_coherent,
-			 unsigned long attrs, const void *caller)
+			 unsigned long attrs, const void *caller, bool zero)
 {
 	u64 mask = get_coherent_dma_mask(dev);
 	struct page *page = NULL;
@@ -743,6 +766,7 @@ static void *__dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
 		.caller = caller,
 		.want_vaddr = ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) == 0),
 		.coherent_flag = is_coherent ? COHERENT : NORMAL,
+		.zero = zero,
 	};
 
 #ifdef CONFIG_DMA_API_DEBUG
@@ -816,14 +840,27 @@ void *arm_dma_alloc(struct device *dev, size_t size, dma_addr_t *handle,
 	pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL);
 
 	return __dma_alloc(dev, size, handle, gfp, prot, false,
-			   attrs, __builtin_return_address(0));
+			   attrs, __builtin_return_address(0), true);
+}
+
+/*
+ * Same as arm_dma_alloc(), except the memory is not zeroed on allocation
+ */
+void *arm_dma_malloc(struct device *dev, size_t size, dma_addr_t *handle,
+		     gfp_t gfp, unsigned long attrs)
+{
+	pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL);
+
+	return __dma_alloc(dev, size, handle, gfp, prot, false,
+			   attrs, __builtin_return_address(0),
+			   false);
 }
 
 static void *arm_coherent_dma_alloc(struct device *dev, size_t size,
 	dma_addr_t *handle, gfp_t gfp, unsigned long attrs)
 {
 	return __dma_alloc(dev, size, handle, gfp, PAGE_KERNEL, true,
-			   attrs, __builtin_return_address(0));
+			   attrs, __builtin_return_address(0), true);
 }
 
 static int __arm_dma_mmap(struct device *dev, struct vm_area_struct *vma,

+ 77 - 5
drivers/clocksource/timer-ti-dm.c

@@ -94,6 +94,13 @@ static void omap_dm_timer_write_reg(struct omap_dm_timer *timer, u32 reg,
 
 static void omap_timer_restore_context(struct omap_dm_timer *timer)
 {
+	/*
+	 * Do not restore the context during late attach. The kernel data
+	 * structures are not in sync with the timer's register settings.
+	 */
+	if (timer->late_attach)
+		return;
+
 	omap_dm_timer_write_reg(timer, OMAP_TIMER_WAKEUP_EN_REG,
 				timer->context.twer);
 	omap_dm_timer_write_reg(timer, OMAP_TIMER_COUNTER_REG,
@@ -194,6 +201,20 @@ static int omap_dm_timer_set_source(struct omap_dm_timer *timer, int source)
 	return ret;
 }
 
+static int omap_dm_timer_is_enabled(struct omap_dm_timer *timer)
+{
+	u32 val;
+
+	val = omap_dm_timer_read_reg(timer, OMAP_TIMER_CTRL_REG);
+
+	/* Check if timer ST bit is set or the Counter register is loaded */
+	if (val & OMAP_TIMER_CTRL_ST ||
+	    omap_dm_timer_read_reg(timer, OMAP_TIMER_COUNTER_REG))
+		return 1;
+	else
+		return 0;
+}
+
 static void omap_dm_timer_enable(struct omap_dm_timer *timer)
 {
 	int c;
@@ -247,6 +268,14 @@ static int omap_dm_timer_prepare(struct omap_dm_timer *timer)
 	__omap_dm_timer_enable_posted(timer);
 	omap_dm_timer_disable(timer);
 
+	/*
+	 * During late attach, do not set the timer source during prepare
+	 * as the timer might be clocked from a different source. It will
+	 * be set properly from remoteproc.
+	 */
+	if (timer->late_attach)
+		return 0;
+
 	rc = omap_dm_timer_set_source(timer, OMAP_TIMER_SRC_32_KHZ);
 
 	return rc;
@@ -503,6 +532,16 @@ static int omap_dm_timer_start(struct omap_dm_timer *timer)
 
 	/* Save the context */
 	timer->context.tclr = l;
+
+	/*
+	 * Now that the timer has been started, call pm_runtime_put_noidle to
+	 * rebalance the pm_runtime device usage count to its regular value,
+	 * and reset the late_attach flag.
+	 */
+	if (timer->late_attach)
+		pm_runtime_put_noidle(&timer->pdev->dev);
+	timer->late_attach = 0;
+
 	return 0;
 }
 
@@ -543,10 +582,18 @@ static int omap_dm_timer_set_load(struct omap_dm_timer *timer, int autoreload,
 		l |= OMAP_TIMER_CTRL_AR;
 	else
 		l &= ~OMAP_TIMER_CTRL_AR;
-	omap_dm_timer_write_reg(timer, OMAP_TIMER_CTRL_REG, l);
-	omap_dm_timer_write_reg(timer, OMAP_TIMER_LOAD_REG, load);
 
-	omap_dm_timer_write_reg(timer, OMAP_TIMER_TRIGGER_REG, 0);
+	/*
+	 * If late attach is enabled, do not modify the dmtimer registers.
+	 * The registers would have been configured already.
+	 */
+	if (!timer->late_attach) {
+		omap_dm_timer_write_reg(timer, OMAP_TIMER_CTRL_REG, l);
+		omap_dm_timer_write_reg(timer, OMAP_TIMER_LOAD_REG, load);
+
+		omap_dm_timer_write_reg(timer, OMAP_TIMER_TRIGGER_REG, 0);
+	}
+
 	/* Save the context */
 	timer->context.tclr = l;
 	timer->context.tldr = load;
@@ -568,13 +615,21 @@ int omap_dm_timer_set_load_start(struct omap_dm_timer *timer, int autoreload,
 	l = omap_dm_timer_read_reg(timer, OMAP_TIMER_CTRL_REG);
 	if (autoreload) {
 		l |= OMAP_TIMER_CTRL_AR;
-		omap_dm_timer_write_reg(timer, OMAP_TIMER_LOAD_REG, load);
+		/*
+		 * If late attach is enabled, do not modify the dmtimer
+		 * registers. The registers would have been configured
+		 * already.
+		 */
+		if (!timer->late_attach)
+			omap_dm_timer_write_reg(timer, OMAP_TIMER_LOAD_REG,
+						load);
 	} else {
 		l &= ~OMAP_TIMER_CTRL_AR;
 	}
 	l |= OMAP_TIMER_CTRL_ST;
 
-	__omap_dm_timer_load_start(timer, l, load, timer->posted);
+	if (!timer->late_attach)
+		__omap_dm_timer_load_start(timer, l, load, timer->posted);
 
 	/* Save the context */
 	timer->context.tclr = l;
@@ -847,6 +902,16 @@ static int omap_dm_timer_probe(struct platform_device *pdev)
 			goto err_get_sync;
 		}
 		__omap_dm_timer_init_regs(timer);
+
+		if (omap_dm_timer_is_enabled(timer))
+			timer->late_attach = 1;
+		/*
+		 * Increase the pm_runtime usage count to prevent kernel power
+		 * management from idling or disabling the timer.
+		 */
+		if (timer->late_attach)
+			pm_runtime_get_noresume(dev);
+
 		pm_runtime_put(dev);
 	}
 
@@ -884,6 +949,12 @@ static int omap_dm_timer_remove(struct platform_device *pdev)
 		if (!strcmp(dev_name(&timer->pdev->dev),
 			    dev_name(&pdev->dev))) {
 			list_del(&timer->node);
+			/*
+			 * Reset device usage counter if late_attach is still
+			 * set
+			 */
+			if (timer->late_attach)
+				pm_runtime_put_noidle(&timer->pdev->dev);
 			ret = 0;
 			break;
 		}
@@ -905,6 +976,7 @@ const static struct omap_dm_timer_ops dmtimer_ops = {
 	.free = omap_dm_timer_free,
 	.enable = omap_dm_timer_enable,
 	.disable = omap_dm_timer_disable,
+	.is_enabled = omap_dm_timer_is_enabled,
 	.get_fclk = omap_dm_timer_get_fclk,
 	.start = omap_dm_timer_start,
 	.stop = omap_dm_timer_stop,

+ 1 - 1
drivers/iommu/omap-iommu-debug.c

@@ -197,7 +197,7 @@ static void dump_ioptable(struct seq_file *s)
 			continue;
 		}
 
-		iopte = iopte_offset(iopgd, 0);
+		iopte = iopte_get(obj, iopgd, 0);
 		for (j = 0; j < PTRS_PER_IOPTE; j++, iopte++) {
 			if (!*iopte)
 				continue;

+ 77 - 12
drivers/iommu/omap-iommu.c

@@ -44,6 +44,12 @@ static const struct iommu_ops omap_iommu_ops;
 /* bitmap of the page sizes currently supported */
 #define OMAP_IOMMU_PGSIZES	(SZ_4K | SZ_64K | SZ_1M | SZ_16M)
 
+/*
+ * total size of the L1 and L2 page tables reserved/used by the bootloader per
+ * rproc for early-boot use cases; must match the value used in the bootloader
+ */
+#define EARLY_PAGE_TABLES_SIZE	SZ_256K
+
 #define MMU_LOCK_BASE_SHIFT	10
 #define MMU_LOCK_BASE_MASK	(0x1f << MMU_LOCK_BASE_SHIFT)
 #define MMU_LOCK_BASE(x)	\
@@ -163,7 +169,7 @@ static int omap2_iommu_enable(struct omap_iommu *obj)
 	if (!obj->iopgd || !IS_ALIGNED((u32)obj->iopgd,  SZ_16K))
 		return -EINVAL;
 
-	pa = virt_to_phys(obj->iopgd);
+	pa = obj->iopgd_pa;
 	if (!IS_ALIGNED(pa, SZ_16K))
 		return -EINVAL;
 
@@ -198,6 +204,15 @@ static int iommu_enable(struct omap_iommu *obj)
 {
 	int ret;
 
+	/*
+	 * Now that the threat of idling has passed, decrement the
+	 * device usage count to balance the increment done in probe;
+	 * the pm_runtime device usage count will be managed normally
+	 * from here on.
+	 */
+	if (obj->late_attach)
+		pm_runtime_put_noidle(obj->dev);
+
 	ret = pm_runtime_get_sync(obj->dev);
 	if (ret < 0)
 		pm_runtime_put_noidle(obj->dev);
@@ -537,7 +552,7 @@ static u32 *iopte_alloc(struct omap_iommu *obj, u32 *iopgd,
 	}
 
 pte_ready:
-	iopte = iopte_offset(iopgd, da);
+	iopte = iopte_get(obj, iopgd, da);
 	*pt_dma = iopgd_page_paddr(iopgd);
 	dev_vdbg(obj->dev,
 		 "%s: da:%08x pgd:%p *pgd:%08x pte:%p *pte:%08x\n",
@@ -696,7 +711,7 @@ iopgtable_lookup_entry(struct omap_iommu *obj, u32 da, u32 **ppgd, u32 **ppte)
 		goto out;
 
 	if (iopgd_is_table(*iopgd))
-		iopte = iopte_offset(iopgd, da);
+		iopte = iopte_get(obj, iopgd, da);
 out:
 	*ppgd = iopgd;
 	*ppte = iopte;
@@ -716,13 +731,13 @@ static size_t iopgtable_clear_entry_core(struct omap_iommu *obj, u32 da)
 
 	if (iopgd_is_table(*iopgd)) {
 		int i;
-		u32 *iopte = iopte_offset(iopgd, da);
+		u32 *iopte = iopte_get(obj, iopgd, da);
 
 		bytes = IOPTE_SIZE;
 		if (*iopte & IOPTE_LARGE) {
 			nent *= 16;
 			/* rewind to the 1st entry */
-			iopte = iopte_offset(iopgd, (da & IOLARGE_MASK));
+			iopte = iopte_get(obj, iopgd, (da & IOLARGE_MASK));
 		}
 		bytes *= nent;
 		memset(iopte, 0, nent * sizeof(*iopte));
@@ -732,7 +747,8 @@ static size_t iopgtable_clear_entry_core(struct omap_iommu *obj, u32 da)
 		/*
 		 * do table walk to check if this table is necessary or not
 		 */
-		iopte = iopte_offset(iopgd, 0);
+		iopte = iopte_get(obj, iopgd, 0);
+
 		for (i = 0; i < PTRS_PER_IOPTE; i++)
 			if (iopte[i])
 				goto out;
@@ -791,8 +807,15 @@ static void iopgtable_clear_entry_all(struct omap_iommu *obj)
 		if (!*iopgd)
 			continue;
 
-		if (iopgd_is_table(*iopgd))
-			iopte_free(obj, iopte_offset(iopgd, 0), true);
+		if (iopgd_is_table(*iopgd)) {
+			if (obj->late_attach)
+				iopte_free(obj, iopte_offset_lateattach(obj,
+									iopgd,
+									0),
+					   true);
+			else
+				iopte_free(obj, iopte_offset(iopgd, 0), true);
+		}
 
 		*iopgd = 0;
 		flush_iopte_range(obj->dev, obj->pd_dma, offset, 1);
@@ -835,7 +858,7 @@ static irqreturn_t iommu_fault_handler(int irq, void *data)
 		return IRQ_NONE;
 	}
 
-	iopte = iopte_offset(iopgd, da);
+	iopte = iopte_get(obj, iopgd, da);
 
 	dev_err(obj->dev, "%s: errs:0x%08x da:0x%08x pgd:0x%p *pgd:0x%08x pte:0x%p *pte:0x%08x\n",
 		obj->name, errs, da, iopgd, *iopgd, iopte, *iopte);
@@ -851,6 +874,16 @@ static irqreturn_t iommu_fault_handler(int irq, void *data)
 static int omap_iommu_attach(struct omap_iommu *obj, u32 *iopgd)
 {
 	int err;
+	u32 iopgd_pa;
+
+	if (obj->late_attach) {
+		iopgd_pa = iommu_read_reg(obj, MMU_TTB);
+		iopgd = ioremap(iopgd_pa, EARLY_PAGE_TABLES_SIZE);
+		if (!iopgd)
+			return -ENOMEM;
+	} else {
+		iopgd_pa = virt_to_phys(iopgd);
+	}
 
 	spin_lock(&obj->iommu_lock);
 
@@ -862,11 +895,14 @@ static int omap_iommu_attach(struct omap_iommu *obj, u32 *iopgd)
 		goto out_err;
 	}
 
+	obj->iopgd_pa = iopgd_pa;
 	obj->iopgd = iopgd;
 	err = iommu_enable(obj);
 	if (err)
 		goto out_err;
-	flush_iotlb_all(obj);
+
+	if (!obj->late_attach)
+		flush_iotlb_all(obj);
 
 	spin_unlock(&obj->iommu_lock);
 
@@ -889,13 +925,19 @@ static void omap_iommu_detach(struct omap_iommu *obj)
 	if (!obj || IS_ERR(obj))
 		return;
 
+	if (obj->late_attach && obj->iopgd)
+		iounmap(obj->iopgd);
+
 	spin_lock(&obj->iommu_lock);
 
 	dma_unmap_single(obj->dev, obj->pd_dma, IOPGD_TABLE_SIZE,
 			 DMA_TO_DEVICE);
 	obj->pd_dma = 0;
+
+	obj->iopgd_pa = 0;
 	obj->iopgd = NULL;
 	iommu_disable(obj);
+	obj->late_attach = 0;
 
 	spin_unlock(&obj->iommu_lock);
 
@@ -1069,7 +1111,9 @@ static int omap_iommu_runtime_resume(struct device *dev)
 		}
 	}
 
-	if (pdata && pdata->deassert_reset) {
+	/* for late attach, skip deasserting reset only during the initial boot */
+	if ((!obj->late_attach || obj->domain) &&
+	    pdata && pdata->deassert_reset) {
 		ret = pdata->deassert_reset(pdev, pdata->reset_name);
 		if (ret) {
 			dev_err(dev, "deassert_reset failed: %d\n", ret);
@@ -1170,6 +1214,7 @@ static int omap_iommu_probe(struct platform_device *pdev)
 	struct omap_iommu *obj;
 	struct resource *res;
 	struct device_node *of = pdev->dev.of_node;
+	struct iommu_platform_data *pdata = dev_get_platdata(&pdev->dev);
 
 	if (!of) {
 		pr_err("%s: only DT-based devices are supported\n", __func__);
@@ -1192,6 +1237,7 @@ static int omap_iommu_probe(struct platform_device *pdev)
 	obj->name = dev_name(&pdev->dev);
 	obj->nr_tlb_entries = 32;
 	err = of_property_read_u32(of, "ti,#tlb-entries", &obj->nr_tlb_entries);
+
 	if (err && err != -EINVAL)
 		return err;
 	if (obj->nr_tlb_entries != 32 && obj->nr_tlb_entries != 8)
@@ -1199,6 +1245,10 @@ static int omap_iommu_probe(struct platform_device *pdev)
 	if (of_find_property(of, "ti,iommu-bus-err-back", NULL))
 		obj->has_bus_err_back = MMU_GP_REG_BUS_ERR_BACK_EN;
 
+	if (pdata && pdata->device_is_enabled &&
+	    pdata->device_is_enabled(pdev))
+		obj->late_attach = 1;
+
 	obj->dev = &pdev->dev;
 	obj->ctx = (void *)obj + sizeof(*obj);
 	obj->cr_ctx = devm_kzalloc(&pdev->dev,
@@ -1247,6 +1297,15 @@ static int omap_iommu_probe(struct platform_device *pdev)
 	}
 
 	pm_runtime_irq_safe(obj->dev);
+
+	/*
+	 * increment the device usage count so that runtime_suspend is not
+	 * invoked immediately after the probe (due to the ti,no-idle-on-init)
+	 * and before any remoteproc has attached to the iommu
+	 */
+	if (obj->late_attach)
+		pm_runtime_get_noresume(obj->dev);
+
 	pm_runtime_enable(obj->dev);
 
 	omap_iommu_debugfs_add(obj);
@@ -1428,6 +1487,11 @@ static int omap_iommu_attach_init(struct device *dev,
 
 	iommu = odomain->iommus;
 	for (i = 0; i < odomain->num_iommus; i++, iommu++) {
+		/*
+		 * Not necessary for late attach, as the page table is already
+		 * set up by the bootloader. The code below is left in place
+		 * since it has no side effects during late attach.
+		 */
 		iommu->pgtable = kzalloc(IOPGD_TABLE_SIZE, GFP_ATOMIC);
 		if (!iommu->pgtable)
 			return -ENOMEM;
@@ -1549,7 +1613,8 @@ static void _omap_iommu_detach_dev(struct omap_iommu_domain *omap_domain,
 	arch_data += (omap_domain->num_iommus - 1);
 	for (i = 0; i < omap_domain->num_iommus; i++, iommu--, arch_data--) {
 		oiommu = iommu->iommu_dev;
-		iopgtable_clear_entry_all(oiommu);
+		if (!oiommu->late_attach)
+			iopgtable_clear_entry_all(oiommu);
 
 		omap_iommu_detach(oiommu);
 		iommu->iommu_dev = NULL;

+ 10 - 0
drivers/iommu/omap-iommu.h

@@ -69,6 +69,8 @@ struct omap_iommu {
 	 * but share it globally for each iommu.
 	 */
 	u32		*iopgd;
+	u32		iopgd_pa;
+	u32		late_attach;
 	spinlock_t	page_table_lock; /* protect iopgd */
 	dma_addr_t	pd_dma;
 
@@ -272,4 +274,12 @@ static inline int iotlb_cr_valid(struct cr_regs *cr)
 	return cr->cam & MMU_CAM_V;
 }
 
+static inline u32 *iopte_get(struct omap_iommu *obj, u32 *iopgd, u32 da)
+{
+	if (obj->late_attach)
+		return iopte_offset_lateattach(obj, iopgd, da);
+	else
+		return iopte_offset(iopgd, da);
+}
+
 #endif /* _OMAP_IOMMU_H */

+ 12 - 0
drivers/iommu/omap-iopgtable.h

@@ -99,4 +99,16 @@ static inline phys_addr_t omap_iommu_translate(u32 d, u32 va, u32 mask)
 #define iopte_index(da)		(((da) >> IOPTE_SHIFT) & (PTRS_PER_IOPTE - 1))
 #define iopte_offset(iopgd, da)	(iopgd_page_vaddr(iopgd) + iopte_index(da))
 
+/*
+ * compute vaddr for second-level page table relative to page table directory
+ * for late-attach mode
+ */
+#define iopgd_page_vaddr_lateattach(obj, pgd)				\
+	((u32 *)((u32 *)((obj)->iopgd)) +				\
+	((u32 *)iopgd_page_paddr((pgd)) - (u32 *)((obj)->iopgd_pa)))
+
+/* to find an entry in the second-level page table for late-attach mode */
+#define iopte_offset_lateattach(obj, iopgd, da)				\
+	(iopgd_page_vaddr_lateattach(obj, iopgd) + iopte_index(da))
+
 #endif /* _OMAP_IOPGTABLE_H */
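
The late-attach variants above only re-base pointers: the bootloader keeps
the L1 and L2 tables physically contiguous within the EARLY_PAGE_TABLES_SIZE
window, so a second-level table's virtual address is just its physical offset
from the table base applied to the ioremapped window. A minimal sketch of the
same arithmetic (hypothetical helper, not part of the patch):

    /* translate a PTE table's physical address into the ioremapped window */
    static u32 *pte_table_vaddr(u32 *iopgd_va, u32 iopgd_pa, u32 pte_pa)
    {
        return iopgd_va + ((pte_pa - iopgd_pa) / sizeof(u32));
    }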

+ 27 - 4
drivers/remoteproc/omap_remoteproc.c

@@ -573,6 +573,10 @@ static int omap_rproc_start(struct rproc *rproc)
 	int ret;
 	struct mbox_client *client = &oproc->client;
 
+	/*
+	 * We set the boot address irrespective of the late-attach flag, as the
+	 * boot address takes effect only when the remoteproc reset is deasserted.
+	 */
 	if (oproc->boot_data) {
 		ret = omap_rproc_write_dsp_boot_addr(rproc);
 		if (ret)
@@ -612,10 +616,12 @@ static int omap_rproc_start(struct rproc *rproc)
 		goto put_mbox;
 	}
 
-	ret = pdata->device_enable(pdev);
-	if (ret) {
-		dev_err(dev, "omap_device_enable failed: %d\n", ret);
-		goto reset_timers;
+	if (!rproc->late_attach) {
+		ret = pdata->device_enable(pdev);
+		if (ret) {
+			dev_err(dev, "omap_device_enable failed: %d\n", ret);
+			goto reset_timers;
+		}
 	}
 
 	/*
@@ -671,6 +677,16 @@ static int omap_rproc_stop(struct rproc *rproc)
 	if (ret)
 		goto enable_device;
 
+	/*
+	 * During late attach, we use non-zeroing dma ops to prevent the kernel
+	 * from overwriting already loaded code and data segments. When
+	 * shutting down the processor, we restore the normal zeroing dma ops.
+	 * This allows the kernel to clear memory when loading a new remoteproc
+	 * binary or during error recovery with the current remoteproc binary.
+	 */
+	if (rproc->late_attach)
+		set_dma_ops(dev, &arm_dma_ops);
+
 	mbox_free_channel(oproc->mbox);
 
 	/*
@@ -1310,6 +1326,11 @@ static int omap_rproc_probe(struct platform_device *pdev)
 	if (!rproc)
 		return -ENOMEM;
 
+	if (pdata->device_is_enabled && pdata->device_is_enabled(pdev)) {
+		rproc->late_attach = 1;
+		set_dma_ops(&pdev->dev, &arm_dma_m_ops);
+	}
+
 	oproc = rproc->priv;
 	oproc->rproc = rproc;
 	/* All existing OMAP IPU and DSP processors have an MMU */
@@ -1398,6 +1419,8 @@ static int omap_rproc_probe(struct platform_device *pdev)
 release_mem:
 	of_reserved_mem_device_release(&pdev->dev);
 free_rproc:
+	if (rproc->late_attach)
+		set_dma_ops(&pdev->dev, &arm_dma_ops);
 	rproc_free(rproc);
 	return ret;
 }

+ 46 - 20
drivers/remoteproc/remoteproc_core.c

@@ -715,10 +715,13 @@ static int rproc_handle_devmem(struct rproc *rproc, struct fw_rsc_devmem *rsc,
 	if (!mapping)
 		return -ENOMEM;
 
-	ret = iommu_map(rproc->domain, rsc->da, rsc->pa, rsc->len, rsc->flags);
-	if (ret) {
-		dev_err(dev, "failed to map devmem: %d\n", ret);
-		goto out;
+	if (!rproc->late_attach) {
+		ret = iommu_map(rproc->domain, rsc->da, rsc->pa, rsc->len,
+				rsc->flags);
+		if (ret) {
+			dev_err(dev, "failed to map devmem: %d\n", ret);
+			goto out;
+		}
 	}
 
 	/*
@@ -733,8 +736,12 @@ static int rproc_handle_devmem(struct rproc *rproc, struct fw_rsc_devmem *rsc,
 	mapping->len = rsc->len;
 	list_add_tail(&mapping->node, &rproc->mappings);
 
-	dev_dbg(dev, "mapped devmem pa 0x%x, da 0x%x, len 0x%x\n",
-		rsc->pa, rsc->da, rsc->len);
+	if (!rproc->late_attach)
+		dev_dbg(dev, "mapped devmem pa 0x%x, da 0x%x, len 0x%x\n",
+			rsc->pa, rsc->da, rsc->len);
+	else
+		dev_dbg(dev, "late-attach: processed devmem pa 0x%x, da 0x%x, len 0x%x\n",
+			rsc->pa, rsc->da, rsc->len);
 
 	return 0;
 
@@ -789,7 +796,13 @@ static int rproc_handle_carveout(struct rproc *rproc,
 	if (!carveout)
 		return -ENOMEM;
 
-	va = dma_alloc_coherent(dev->parent, rsc->len, &dma, GFP_KERNEL);
+	if (rproc->late_attach) {
+		va = dma_malloc_coherent(dev->parent, rsc->len, &dma,
+					 GFP_KERNEL);
+	} else {
+		va = dma_alloc_coherent(dev->parent, rsc->len, &dma,
+					GFP_KERNEL);
+	}
 	if (!va) {
 		dev_err(dev->parent,
 			"failed to allocate dma memory: len 0x%x\n", rsc->len);
@@ -824,11 +837,13 @@ static int rproc_handle_carveout(struct rproc *rproc,
 			goto dma_free;
 		}
 
-		ret = iommu_map(rproc->domain, rsc->da, dma, rsc->len,
-				rsc->flags);
-		if (ret) {
-			dev_err(dev, "iommu_map failed: %d\n", ret);
-			goto free_mapping;
+		if (!rproc->late_attach) {
+			ret = iommu_map(rproc->domain, rsc->da, dma, rsc->len,
+					rsc->flags);
+			if (ret) {
+				dev_err(dev, "iommu_map failed: %d\n", ret);
+				goto free_mapping;
+			}
 		}
 
 		/*
@@ -842,8 +857,13 @@ static int rproc_handle_carveout(struct rproc *rproc,
 		mapping->len = rsc->len;
 		list_add_tail(&mapping->node, &rproc->mappings);
 
-		dev_dbg(dev, "carveout mapped 0x%x to %pad\n",
-			rsc->da, &dma);
+		if (!rproc->late_attach)
+			dev_dbg(dev, "carveout mapped 0x%x to %pad\n",
+				rsc->da, &dma);
+		else
+			dev_dbg(dev, "late-attach: carveout processed 0x%x to %pad\n",
+				rsc->da, &dma);
+
 	}
 
 	/*
@@ -1116,11 +1136,14 @@ static void rproc_resource_cleanup(struct rproc *rproc)
 	list_for_each_entry_safe(entry, tmp, &rproc->mappings, node) {
 		size_t unmapped;
 
-		unmapped = iommu_unmap(rproc->domain, entry->da, entry->len);
-		if (unmapped != entry->len) {
-			/* nothing much to do besides complaining */
-			dev_err(dev, "failed to unmap %u/%zu\n", entry->len,
-				unmapped);
+		if (!rproc->late_attach) {
+			unmapped = iommu_unmap(rproc->domain, entry->da,
+					       entry->len);
+			if (unmapped != entry->len) {
+				/* nothing much to do besides complaining */
+				dev_err(dev, "failed to unmap %u/%zu\n",
+					entry->len, unmapped);
+			}
 		}
 
 		list_del(&entry->node);
@@ -1199,7 +1222,7 @@ static int rproc_fw_boot(struct rproc *rproc, const struct firmware *fw)
 		goto clean_up_resources;
 	}
 
-	if (!rproc->skip_load) {
+	if (!rproc->skip_load && !rproc->late_attach) {
 		/* load the ELF segments to memory */
 		ret = rproc_load_segments(rproc, fw);
 		if (ret) {
@@ -1207,6 +1230,8 @@ static int rproc_fw_boot(struct rproc *rproc, const struct firmware *fw)
 				ret);
 			goto clean_up_resources;
 		}
+	} else {
+		dev_dbg(dev, "skipped loading program segments for pre-booted rproc\n");
 	}
 
 	/*
@@ -1658,6 +1683,7 @@ void rproc_shutdown(struct rproc *rproc)
 		complete_all(&rproc->crash_comp);
 
 	rproc->state = RPROC_OFFLINE;
+	rproc->late_attach = 0;
 
 	dev_info(dev, "stopped remote processor %s\n", rproc->name);
 

+ 1 - 0
include/clocksource/timer-ti-dm.h

@@ -116,6 +116,7 @@ struct omap_dm_timer {
 	u32 errata;
 	struct platform_device *pdev;
 	struct list_head node;
+	u32 late_attach;
 };
 
 int omap_dm_timer_reserve_systimer(int id);

+ 20 - 6
include/linux/dma-mapping.h

@@ -161,7 +161,8 @@ static inline int is_device_dma_capable(struct device *dev)
  * Don't use them in device drivers.
  */
 int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size,
-				       dma_addr_t *dma_handle, void **ret);
+				dma_addr_t *dma_handle, void **ret,
+				bool zero);
 int dma_release_from_dev_coherent(struct device *dev, int order, void *vaddr);
 
 int dma_mmap_from_dev_coherent(struct device *dev, struct vm_area_struct *vma,
@@ -173,7 +174,7 @@ int dma_mmap_from_global_coherent(struct vm_area_struct *vma, void *cpu_addr,
 				  size_t size, int *ret);
 
 #else
-#define dma_alloc_from_dev_coherent(dev, size, handle, ret) (0)
+#define dma_alloc_from_dev_coherent(dev, size, handle, ret, zero) (0)
 #define dma_release_from_dev_coherent(dev, order, vaddr) (0)
 #define dma_mmap_from_dev_coherent(dev, vma, vaddr, order, ret) (0)
 
@@ -505,9 +506,9 @@ dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt, void *cpu_addr,
 #define arch_dma_alloc_attrs(dev)	(true)
 #endif
 
-static inline void *dma_alloc_attrs(struct device *dev, size_t size,
-				       dma_addr_t *dma_handle, gfp_t flag,
-				       unsigned long attrs)
+static inline void *dma_malloc_attrs(struct device *dev, size_t size,
+				     dma_addr_t *dma_handle, gfp_t flag,
+				     unsigned long attrs, bool zero)
 {
 	const struct dma_map_ops *ops = get_dma_ops(dev);
 	void *cpu_addr;
@@ -515,7 +516,7 @@ static inline void *dma_alloc_attrs(struct device *dev, size_t size,
 	BUG_ON(!ops);
 	WARN_ON_ONCE(dev && !dev->coherent_dma_mask);
 
-	if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr))
+	if (dma_alloc_from_dev_coherent(dev, size, dma_handle, &cpu_addr, zero))
 		return cpu_addr;
 
 	/* let the implementation decide on the zone to allocate from: */
@@ -531,6 +532,13 @@ static inline void *dma_alloc_attrs(struct device *dev, size_t size,
 	return cpu_addr;
 }
 
+static inline void *dma_alloc_attrs(struct device *dev, size_t size,
+				    dma_addr_t *dma_handle, gfp_t flag,
+				    unsigned long attrs)
+{
+	return dma_malloc_attrs(dev, size, dma_handle, flag, attrs, true);
+}
+
 static inline void dma_free_attrs(struct device *dev, size_t size,
 				     void *cpu_addr, dma_addr_t dma_handle,
 				     unsigned long attrs)
@@ -563,6 +571,12 @@ static inline void *dma_alloc_coherent(struct device *dev, size_t size,
 	return dma_alloc_attrs(dev, size, dma_handle, flag, 0);
 }
 
+static inline void *dma_malloc_coherent(struct device *dev, size_t size,
+					dma_addr_t *dma_handle, gfp_t flag)
+{
+	return dma_malloc_attrs(dev, size, dma_handle, flag, 0, false);
+}
+
 static inline void dma_free_coherent(struct device *dev, size_t size,
 		void *cpu_addr, dma_addr_t dma_handle)
 {

+ 1 - 0
include/linux/platform_data/dmtimer-omap.h

@@ -28,6 +28,7 @@ struct omap_dm_timer_ops {
 	int	(*free)(struct omap_dm_timer *timer);
 
 	void	(*enable)(struct omap_dm_timer *timer);
+	int	(*is_enabled)(struct omap_dm_timer *timer);
 	void	(*disable)(struct omap_dm_timer *timer);
 
 	int	(*get_irq)(struct omap_dm_timer *timer);

+ 1 - 0
include/linux/platform_data/iommu-omap.h

@@ -17,6 +17,7 @@ struct iommu_platform_data {
 	int (*assert_reset)(struct platform_device *pdev, const char *name);
 	int (*deassert_reset)(struct platform_device *pdev, const char *name);
 	int (*device_enable)(struct platform_device *pdev);
+	bool (*device_is_enabled)(struct platform_device *pdev);
 	int (*device_idle)(struct platform_device *pdev);
 	int (*set_pwrdm_constraint)(struct platform_device *pdev, bool request,
 				    u8 *pwrst);

+ 2 - 0
include/linux/platform_data/remoteproc-omap.h

@@ -15,10 +15,12 @@ struct platform_device;
  * struct omap_rproc_pdata - omap remoteproc's platform data
  * @device_enable: omap-specific handler for enabling a device
  * @device_shutdown: omap-specific handler for shutting down a device
+ * @device_is_enabled: omap-specific handler to check if device is booted
  */
 struct omap_rproc_pdata {
 	int (*device_enable)(struct platform_device *pdev);
 	int (*device_shutdown)(struct platform_device *pdev);
+	bool (*device_is_enabled)(struct platform_device *pdev);
 };
 
 #endif /* _PLAT_REMOTEPROC_H */

+ 2 - 0
include/linux/remoteproc.h

@@ -493,6 +493,7 @@ struct rproc_dump_segment {
  * @deny_sysfs_ops: flag to not permit sysfs operations on state and firmware
  * @skip_firmware_request: flag to skip requesting the firmware
  * @skip_load: flag to skip the loading of firmware segments
+ * @late_attach: flag indicating remote core has been externally pre-booted
  * @dump_segments: list of segments in the firmware
  */
 struct rproc {
@@ -531,6 +532,7 @@ struct rproc {
 	unsigned int deny_sysfs_ops		: 1;
 	unsigned int skip_firmware_request	: 1;
 	unsigned int skip_load			: 1;
+	unsigned int late_attach		: 1;
 	struct list_head dump_segments;
 };
 

+ 6 - 5
kernel/dma/coherent.c

@@ -161,7 +161,7 @@ void *dma_mark_declared_memory_occupied(struct device *dev,
 EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
 
 static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem,
-		ssize_t size, dma_addr_t *dma_handle)
+		ssize_t size, dma_addr_t *dma_handle, bool zero)
 {
 	int order = get_order(size);
 	unsigned long flags;
@@ -183,7 +183,8 @@ static void *__dma_alloc_from_coherent(struct dma_coherent_mem *mem,
 	*dma_handle = mem->device_base + (pageno << PAGE_SHIFT);
 	ret = mem->virt_base + (pageno << PAGE_SHIFT);
 	spin_unlock_irqrestore(&mem->spinlock, flags);
-	memset(ret, 0, size);
+	if (zero)
+		memset(ret, 0, size);
 	return ret;
 err:
 	spin_unlock_irqrestore(&mem->spinlock, flags);
@@ -205,14 +206,14 @@ err:
  * generic memory areas, or !0 if dma_alloc_coherent should return @ret.
  */
 int dma_alloc_from_dev_coherent(struct device *dev, ssize_t size,
-		dma_addr_t *dma_handle, void **ret)
+		dma_addr_t *dma_handle, void **ret, bool zero)
 {
 	struct dma_coherent_mem *mem = dev_get_coherent_memory(dev);
 
 	if (!mem)
 		return 0;
 
-	*ret = __dma_alloc_from_coherent(mem, size, dma_handle);
+	*ret = __dma_alloc_from_coherent(mem, size, dma_handle, zero);
 	if (*ret)
 		return 1;
 
@@ -231,7 +232,7 @@ void *dma_alloc_from_global_coherent(ssize_t size, dma_addr_t *dma_handle)
 		return NULL;
 
 	return __dma_alloc_from_coherent(dma_coherent_default_memory, size,
-			dma_handle);
+			dma_handle, true);
 }
 
 static int __dma_release_from_coherent(struct dma_coherent_mem *mem,