Allow the system administrator to change at run time whether cluster scheduling should be used. The setting can be changed via the sysctl variable /proc/sys/kernel/sched_cluster_enabled. Setting it to 1 enables cluster scheduling and setting it to 0 turns it off.
Cluster scheduling should benefit independent tasks by load balancing them between clusters, reducing contention on cluster resources. However, the extra load balancing overhead may degrade some workloads, so the default setting is off. --- arch/x86/kernel/smpboot.c | 8 ++++++ drivers/base/arch_topology.c | 7 ++++++ include/linux/sched/sysctl.h | 6 +++++ include/linux/topology.h | 1 + kernel/sched/core.c | 1 + kernel/sched/sched.h | 6 +++++ kernel/sched/topology.c | 47 ++++++++++++++++++++++++++++++++++++ kernel/sysctl.c | 11 +++++++++ 8 files changed, 87 insertions(+)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 3162d0fc6b3c..9c31030bb784 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -57,6 +57,7 @@ #include <linux/pgtable.h> #include <linux/overflow.h> #include <linux/syscore_ops.h> +#include <linux/cpuset.h>
#include <asm/acpi.h> #include <asm/desc.h> @@ -127,6 +128,13 @@ int arch_update_cpu_topology(void) return retval; }
+void arch_rebuild_cpu_topology(void) +{ + x86_topology_update = true; + rebuild_sched_domains(); + x86_topology_update = false; +} + static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) { unsigned long flags; diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 0e1070aec26c..756643303dea 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -191,6 +191,13 @@ int topology_update_cpu_topology(void) return update_topology; }
+void __weak arch_rebuild_cpu_topology(void) +{ + update_topology = 1; + rebuild_sched_domains(); + update_topology = 0; +} + /* * Updating the sched_domains can't be done directly from cpufreq callbacks * due to locking, so queue the work for later. diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index db2c0f34aaaf..b8e3a9136875 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -93,6 +93,12 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer, int sysctl_schedstats(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos);
+#ifdef CONFIG_SCHED_CLUSTER +extern unsigned int sysctl_sched_cluster_enabled; +int sched_cluster_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +#endif + #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) extern unsigned int sysctl_sched_energy_aware; int sched_energy_aware_handler(struct ctl_table *table, int write, diff --git a/include/linux/topology.h b/include/linux/topology.h index 0b3704ad13c8..42bcfd5d9fdb 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -44,6 +44,7 @@ if (nr_cpus_node(node))
int arch_update_cpu_topology(void); +void arch_rebuild_cpu_topology(void);
/* Conform to ACPI 2.0 SLIT distance definitions */ #define LOCAL_DISTANCE 10 diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5226cc26a095..a180f10f9ffc 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8033,6 +8033,7 @@ int sched_cpu_dying(unsigned int cpu) void __init sched_init_smp(void) { sched_init_numa(); + set_sched_cluster();
/* * There's no userspace yet to cause hotplug operations; hence all the diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a189bec13729..dc80a46f8e93 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1379,6 +1379,12 @@ this_rq_lock_irq(struct rq_flags *rf) return rq; }
+#ifdef CONFIG_SCHED_CLUSTER +extern void set_sched_cluster(void); +#else +static inline void set_sched_cluster(void) { } +#endif + #ifdef CONFIG_NUMA enum numa_topology_type { NUMA_DIRECT, diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index b019129e515e..90a1f71d9de7 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -9,6 +9,8 @@ DEFINE_MUTEX(sched_domains_mutex); /* Protected by sched_domains_mutex: */ static cpumask_var_t sched_domains_tmpmask; static cpumask_var_t sched_domains_tmpmask2; +/* set via /proc/sys/kernel/sched_cluster_enabled */ +unsigned int __read_mostly sysctl_sched_cluster_enabled;
#ifdef CONFIG_SCHED_DEBUG
@@ -205,6 +207,34 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) return 1; }
+#ifdef CONFIG_SCHED_CLUSTER +void set_sched_cluster(void); + +DEFINE_MUTEX(sched_cluster_mutex); +int sched_cluster_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + unsigned int oldval; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&sched_cluster_mutex); + oldval = sysctl_sched_cluster_enabled; + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (!ret && write) { + if (oldval != sysctl_sched_cluster_enabled) { + set_sched_cluster(); + arch_rebuild_cpu_topology(); + } + } + mutex_unlock(&sched_cluster_mutex); + + return ret; +} +#endif + #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) DEFINE_STATIC_KEY_FALSE(sched_energy_present); unsigned int sysctl_sched_energy_aware = 1; @@ -1527,6 +1557,23 @@ static struct sched_domain_topology_level default_topology[] = { static struct sched_domain_topology_level *sched_domain_topology = default_topology;
+#ifdef CONFIG_SCHED_CLUSTER +void set_sched_cluster(void) +{ + struct sched_domain_topology_level *tl; + + for (tl = sched_domain_topology; tl->mask; tl++) { + if (tl->sd_flags && (tl->sd_flags() & SD_CLUSTER)) { + if (!sysctl_sched_cluster_enabled) + tl->flags |= SDTL_SKIP; + else + tl->flags &= ~SDTL_SKIP; + break; + } + } +} +#endif + static struct sched_domain_topology_level *next_tl(struct sched_domain_topology_level *tl) { ++tl; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d4a78e08f6d8..7cd8f10bf953 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1848,6 +1848,17 @@ static struct ctl_table kern_table[] = { .extra2 = SYSCTL_ONE, }, #endif +#ifdef CONFIG_SCHED_CLUSTER + { + .procname = "sched_cluster_enabled", + .data = &sysctl_sched_cluster_enabled, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_cluster_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif #ifdef CONFIG_PROVE_LOCKING { .procname = "prove_locking",