git subrepo pull (merge) linux-tkg

subrepo:
  subdir:   "linux-tkg"
  merged:   "80fe9da3"
upstream:
  origin:   "https://github.com/Frogging-Family/linux-tkg"
  branch:   "master"
  commit:   "61b14da7"
git-subrepo:
  version:  "0.4.3"
  origin:   "???"
  commit:   "???"
Nathan 2022-04-04 21:54:07 -05:00
parent f83940bd29
commit da9b8d972f
23 changed files with 18467 additions and 252 deletions
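For context, this commit was generated by the `git subrepo` tool; the metadata block above records the merged and upstream commits. A sketch of the invocation that produces such a commit, assuming git-subrepo 0.4.3 and its `--method` option (inferred from the `method = merge` / `cmdver = 0.4.3` fields, not copied from this repo):

```bash
# From the parent repository's root: pull upstream master into the
# linux-tkg/ subdirectory, merging (rather than rebasing) local changes
git subrepo pull linux-tkg --method=merge
```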

View File

@ -6,7 +6,7 @@
[subrepo]
remote = https://github.com/Frogging-Family/linux-tkg
branch = master
commit = f671c27f6bc882a2cb559115ddb73f55ccf4f922
commit = 61b14da7960e4cd1a31fb5ee235ec33933fa5645
parent = deeb3f62eca732218713da4a4e131d167bff20ee
method = merge
cmdver = 0.4.3

View File

@ -59,7 +59,7 @@ else
fi
pkgname=("${pkgbase}" "${pkgbase}-headers")
pkgver="${_basekernel}"."${_sub}"
pkgrel=249
pkgrel=252
pkgdesc='Linux-tkg'
arch=('x86_64') # no i686 in here
url="http://www.kernel.org/"
@ -98,6 +98,7 @@ case $_basever in
0009-glitched-bmq.patch
0009-bmq_v5.4-r2.patch
0012-linux-hardened.patch
0012-misc-additions.patch
)
sha256sums=('bf338980b1670bca287f9994b7441c2361907635879169c64ae78364efc5f491'
'cd978b0ca835f0c9fec86f70cdfcc8e103653868e4b44996dc02ab49d6783df0'
@ -120,7 +121,8 @@ case $_basever in
'2d9260b80b43bbd605cf420d6bd53aa7262103dfd77196ba590ece5600b6dc0d'
'3832f828a9f402b153fc9a6829c5a4eaf6091804bcda3a0423c8e1b57e26420d'
'c98befca824f761260466410a1dd94d2b9be6f7211b5daefcfc0f3a102bbdc81'
'aeb31404c26ee898d007b1f66cb9572c9884ad8eca14edc4587d68f6cba6de46')
'aeb31404c26ee898d007b1f66cb9572c9884ad8eca14edc4587d68f6cba6de46'
'87f8ba249620628ad493ed5d65da9811bf635411a70aaa49ed1d97438ebf77be')
;;
57)
source=("$kernel_site"
@ -655,7 +657,7 @@ case $_basever in
#0008-5.14-bcachefs.patch
0009-glitched-ondemand-bmq.patch
0009-glitched-bmq.patch
0009-prjc_v5.16-r0.patch
0009-prjc_v5.16-r1.patch
#0012-linux-hardened.patch
0012-misc-additions.patch
# MM Dirty Soft for WRITE_WATCH support in Wine
@ -679,7 +681,7 @@ case $_basever in
'f91223f98f132602a4fa525917a1f27afe30bdb55a1ac863e739c536188417b3'
'9fad4a40449e09522899955762c8928ae17f4cdaa16e01239fd12592e9d58177'
'a557b342111849a5f920bbe1c129f3ff1fc1eff62c6bd6685e0972fc88e39911'
'7bd99d10ec9f834de95424d033f940f9531beb3a7b4d9711448f0ed66832c03d'
'ccf8d7dc78e92577f826f3e4d76453b1a873d41eb0df15528d117b25925b3f77'
#'decd4a55c0d47b1eb808733490cdfea1207a2022d46f06d04a3cc60fdcb3f32c'
'1aa0a172e1e27fb8171053f3047dcf4a61bd2eda5ea18f02b2bb391741a69887'
'1b656ad96004f27e9dc63d7f430b50d5c48510d6d4cd595a81c24b21adb70313'
@ -716,7 +718,7 @@ case $_basever in
)
sha256sums=('555fef61dddb591a83d62dd04e252792f9af4ba9ef14683f64840e46fa20b1b1'
'24c982d6b7b704b31a6b4bc0a59cbf2135697a58d5d6030532ae07856da36944'
'bd1e57c15d4eb62024d2ee935b54d36e74e73b22c3800b45ecf9233521a9f74b'
'c05ff4ca7a69b94ace73d1019c398f3fd93dfa0ebcc9b2396b56aaa191fc72e0'
'1e15fc2ef3fa770217ecc63a220e5df2ddbcf3295eb4a021171e7edd4c6cc898'
'66a03c246037451a77b4d448565b1d7e9368270c7d02872fbd0b5d024ed0a997'
'f6383abef027fd9a430fd33415355e0df492cdc3c90e9938bf2d98f4f63b32e6'
@ -731,12 +733,61 @@ case $_basever in
'f91223f98f132602a4fa525917a1f27afe30bdb55a1ac863e739c536188417b3'
'9fad4a40449e09522899955762c8928ae17f4cdaa16e01239fd12592e9d58177'
'a557b342111849a5f920bbe1c129f3ff1fc1eff62c6bd6685e0972fc88e39911'
'5d8aa3d707982e324d3ce8fcc5f832035d8155dc703f0125bbaa21cd87ce26f3'
'95fd78b725360742ead8c55eea7e3ff7ac8ae11ae1a21a0d881e168a841bc9b4'
#'decd4a55c0d47b1eb808733490cdfea1207a2022d46f06d04a3cc60fdcb3f32c'
'1aa0a172e1e27fb8171053f3047dcf4a61bd2eda5ea18f02b2bb391741a69887'
'1b656ad96004f27e9dc63d7f430b50d5c48510d6d4cd595a81c24b21adb70313'
'b0319a7dff9c48b2f3e3d3597ee154bf92223149a633a8b7ce4026252db86da6')
;;
518)
source=("$kernel_site"
#"$patch_site"
'config.x86_64' # stock Arch config
#'config_hardened.x86_64' # hardened Arch config
90-cleanup.hook
cleanup
# ARCH Patches
0001-add-sysctl-to-disallow-unprivileged-CLONE_NEWUSER-by.patch
# TkG
0002-clear-patches.patch
0003-glitched-base.patch
0003-glitched-cfs.patch
0003-glitched-cfs-additions.patch
#0005-glitched-pds.patch
0006-add-acs-overrides_iommu.patch
0007-v5.18-fsync1_via_futex_waitv.patch
0007-v5.18-winesync.patch
#0008-5.14-bcachefs.patch
#0009-glitched-ondemand-bmq.patch
#0009-glitched-bmq.patch
#0009-prjc_v5.18-r0.patch
#0012-linux-hardened.patch
0012-misc-additions.patch
# MM Dirty Soft for WRITE_WATCH support in Wine
0001-mm-Support-soft-dirty-flag-reset-for-VA-range.patch
0002-mm-Support-soft-dirty-flag-read-with-reset.patch
)
sha256sums=('a7ae23d354937723b3ee65513c2707c02541a0553ae9a7d5c7136525335d4423'
#upcoming_kernel_patch_sha256
'c96860d65053cb5accdf1d4f94f4b7bbe46e90a3e869aa3b4cdebc48a68289c1'
'1e15fc2ef3fa770217ecc63a220e5df2ddbcf3295eb4a021171e7edd4c6cc898'
'66a03c246037451a77b4d448565b1d7e9368270c7d02872fbd0b5d024ed0a997'
'f6383abef027fd9a430fd33415355e0df492cdc3c90e9938bf2d98f4f63b32e6'
'35a7cde86fb94939c0f25a62b8c47f3de0dbd3c65f876f460b263181b3e92fc0'
'94eb8f15f4297057c8229bf75c9e08adbaa812f68999bb62dae0e226e37503eb'
'5efd40c392ece498d2d43d5443e6537c2d9ef7cf9820d5ce80b6577fc5d1a4b2'
'e5ea0bb25ee294c655ac3cc30e1eea497799826108fbfb4ef3258c676c1e8a12'
#'fca63d15ca4502aebd73e76d7499b243d2c03db71ff5ab0bf5cf268b2e576320'
'19661ec0d39f9663452b34433214c755179894528bf73a42f6ba52ccf572832a'
'9df628fd530950e37d31da854cb314d536f33c83935adf5c47e71266a55f7004'
'f91223f98f132602a4fa525917a1f27afe30bdb55a1ac863e739c536188417b3'
#'9fad4a40449e09522899955762c8928ae17f4cdaa16e01239fd12592e9d58177'
#'a557b342111849a5f920bbe1c129f3ff1fc1eff62c6bd6685e0972fc88e39911'
#'95fd78b725360742ead8c55eea7e3ff7ac8ae11ae1a21a0d881e168a841bc9b4'
'1aa0a172e1e27fb8171053f3047dcf4a61bd2eda5ea18f02b2bb391741a69887'
'1b656ad96004f27e9dc63d7f430b50d5c48510d6d4cd595a81c24b21adb70313'
'b0319a7dff9c48b2f3e3d3597ee154bf92223149a633a8b7ce4026252db86da6')
;;
esac
export KBUILD_BUILD_HOST=archlinux

View File

@ -5,6 +5,7 @@ This repository provides scripts to automatically download, patch and compile th
### Important information
- **Non-pacman distros support can be considered experimental. You're invited to report issues you might encounter with it.**
- **If your distro isn't using systemd, please set _configfile="running-kernel" in customization.cfg or you might end up with a non-bootable kernel**
- The `intel_pstate` driver's frequency scaling aggressiveness changed in kernel 5.5 (in favor of higher power savings), which results in stutters and poor performance in low/medium load scenarios. As a workaround for our gaming needs, we set it to passive mode to make use of the `acpi_cpufreq` governor passthrough, keeping full support for turbo frequencies. By default it's combined with our aggressive ondemand governor for good performance on most CPUs while keeping frequency scaling for power savings. In a typical low/medium load scenario (Core i7 9700K playing Mario Galaxy on the Dolphin emulator), intel_pstate in performance mode gives a stuttery 45-50 fps experience, while passive mode + aggressive ondemand offers a locked 60 fps — see the quick check after this list.
- Nvidia's proprietary drivers might need to be patched if they don't support your chosen kernel OOTB: [Frogging-Family nvidia-all](https://github.com/Frogging-Family/nvidia-all) can do that automatically for you.
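A quick way to verify the passive-mode workaround took effect on a running kernel — a sketch using the standard cpufreq sysfs paths (not part of this repo):

```bash
# With intel_pstate in passive mode, the scaling driver reports
# "intel_cpufreq" and generic governors such as "ondemand" become available
cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_driver
cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
```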
@ -35,7 +36,7 @@ The `customization.cfg` file offers many toggles for extra tweaks:
- `Fsync`, `Futex2` and `Fastsync+winesync` support: can improve the performance in games, needs a patched wine like [wine-tkg](https://github.com/Frogging-Family/wine-tkg-git)
- [Graysky's per-CPU-arch native optimizations](https://github.com/graysky2/kernel_compiler_patch): tunes the compiled code to a specified CPU
- Compile with GCC or Clang with optional `O2`/`O3` and `LTO` (Clang only) optimizations.
- **Warning regarding DKMS modules and Clang:** `DKMS` will default to using GCC, which will fail to build modules against a Clang-built kernel. This will - for example - break Nvidia drivers. Forcing `DKMS` to use Clang can be done but isn't recommended.
- **Warning regarding DKMS modules prior to v3.0.2 (2021-11-21) and Clang:** `DKMS` version v3.0.1 and earlier will default to using GCC, which will fail to build modules against a Clang-built kernel. This will - for example - break Nvidia drivers. Forcing older `DKMS` to use Clang can be done but isn't recommended.
- Using [Modprobed-db](https://github.com/graysky2/modprobed-db)'s database can reduce the compilation time and produce a smaller kernel which will only contain the modules listed in it (see the sketch after this list). **NOT recommended**
- **Warning**: make sure to read [thoroughly about it first](https://wiki.archlinux.org/index.php/Modprobed-db) since it comes with caveats that can lead to an unbootable kernel.
- "Zenify" patchset using core blk, mm and scheduler tweaks from Zen

View File

@ -2526,8 +2526,8 @@ CONFIG_BLK_DEV_RNBD_SERVER=m
#
# NVME Support
#
CONFIG_NVME_CORE=y
CONFIG_BLK_DEV_NVME=y
CONFIG_NVME_CORE=m
CONFIG_BLK_DEV_NVME=m
CONFIG_NVME_MULTIPATH=y
CONFIG_NVME_HWMON=y
CONFIG_NVME_FABRICS=m

View File

@ -0,0 +1,14 @@
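# libalpm hook: run the cleanup script after any transaction touching kernel module directories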
[Trigger]
Type = File
Operation = Install
Operation = Upgrade
Operation = Remove
Target = usr/lib/modules/*/
Target = !usr/lib/modules/*/?*
[Action]
Description = Cleaning up...
When = PostTransaction
Exec = /usr/share/libalpm/scripts/cleanup
NeedsTargets

View File

@ -0,0 +1,10 @@
#!/bin/bash
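# Remove leftover module trees of linux-tkg kernels that were uninstalled (no vmlinuz left)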
for _f in /usr/lib/modules/*tkg*; do
if [[ ! -e ${_f}/vmlinuz ]]; then
rm -rf "$_f"
fi
done
# vim:set ft=sh sw=2 et:

File diff suppressed because it is too large

View File

@ -1,7 +1,7 @@
#!/bin/bash
# List of kernels that are maintained upstream
_current_kernels=("5.17" "5.16" "5.15" "5.10" "5.4")
_current_kernels=("5.18" "5.17" "5.16" "5.15" "5.10" "5.4")
# List of kernels that are no longer maintained upstream
_eol_kernels=("5.14" "5.13" "5.12" "5.11" "5.9" "5.8" "5.7")
@ -20,6 +20,7 @@ _kver_subver_map=(
["5.15"]="32"
["5.16"]="18"
["5.17"]="1"
["5.18"]="rc1"
)
# PREEMPT_RT's supported kernel subversion
@ -517,6 +518,8 @@ _tkg_srcprep() {
rev=3
elif [ "$_basever" = "515" ]; then
rev=1
elif [ "$_basever" = "516" ]; then
rev=1
else
rev=0
fi
@ -803,7 +806,7 @@ _tkg_srcprep() {
_disable "SCHED_BMQ"
elif [ "${_cpusched}" = "cacule" ]; then
_enable "SCHED_AUTOGROUP" "CACULE_SCHED"
_disable "BSD_PROCESS_ACCT" "TASK_XACCT" "CGROUP_CPUACCT" "CGROUP_DEBUG"
_disable "BSD_PROCESS_ACCT" "TASK_XACCT" "CGROUP_CPUACCT" "CGROUP_DEBUG" "CACULE_RDB"
if [ "$_cacule_rdb" = "true" ]; then
_enable "CACULE_RDB"
scripts/config --set-val "RDB_INTERVAL" "$_cacule_rdb_interval"
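For reference, `_enable` and `_disable` in the hunk above are the prepare script's helpers around the kernel's `scripts/config` tool — a minimal sketch under that assumption, not the verbatim implementation:

```bash
# Hypothetical equivalents of the prepare script's Kconfig helpers
_enable() {
  for _sym in "$@"; do
    scripts/config --enable "$_sym"   # sets CONFIG_<sym>=y
  done
}

_disable() {
  for _sym in "$@"; do
    scripts/config --disable "$_sym"  # marks CONFIG_<sym> as not set
  done
}
```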

View File

@ -1,5 +1,5 @@
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 2fba82431efb..654a29d94696 100644
index 391b3f9055fe..5d0e76e5a815 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5027,6 +5027,12 @@
@ -176,34 +176,23 @@ index 8874f681b056..59eb72bf7d5f 100644
[RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY }, \
}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 78c351e35fec..c6746f5ec3f5 100644
index ee5ed8821963..61ee2514329a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -748,8 +748,14 @@ struct task_struct {
@@ -748,7 +748,12 @@ struct task_struct {
unsigned int ptrace;
#ifdef CONFIG_SMP
- int on_cpu;
struct __call_single_node wake_entry;
+ struct __call_single_node wake_entry;
+#endif
+#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_ALT)
+ int on_cpu;
int on_cpu;
+#endif
+
+#ifdef CONFIG_SMP
+#ifndef CONFIG_SCHED_ALT
+#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_ALT)
struct __call_single_node wake_entry;
unsigned int wakee_flips;
unsigned long wakee_flip_decay_ts;
struct task_struct *last_wakee;
@@ -763,6 +769,7 @@ struct task_struct {
*/
int recent_used_cpu;
int wake_cpu;
+#endif /* !CONFIG_SCHED_ALT */
#endif
int on_rq;
@@ -771,6 +778,20 @@ struct task_struct {
@@ -771,6 +776,20 @@ struct task_struct {
int normal_prio;
unsigned int rt_priority;
@ -224,7 +213,7 @@ index 78c351e35fec..c6746f5ec3f5 100644
struct sched_entity se;
struct sched_rt_entity rt;
struct sched_dl_entity dl;
@@ -781,6 +802,7 @@ struct task_struct {
@@ -781,6 +800,7 @@ struct task_struct {
unsigned long core_cookie;
unsigned int core_occupation;
#endif
@ -232,7 +221,7 @@ index 78c351e35fec..c6746f5ec3f5 100644
#ifdef CONFIG_CGROUP_SCHED
struct task_group *sched_task_group;
@@ -1501,6 +1523,15 @@ struct task_struct {
@@ -1501,6 +1521,15 @@ struct task_struct {
*/
};
@ -487,7 +476,7 @@ index ce77f0265660..3cccf8caa1be 100644
This option permits Core Scheduling, a means of coordinated task
selection across SMT siblings. When enabled -- see
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index d0e163a02099..b5276a7a5d82 100644
index df62527f5e0b..556e69cdd44f 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -682,7 +682,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
@ -643,10 +632,10 @@ index c7421f2d05e1..9b32442ff2ca 100644
obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
diff --git a/kernel/sched/alt_core.c b/kernel/sched/alt_core.c
new file mode 100644
index 000000000000..114bd1fd88eb
index 000000000000..83407c4ee806
--- /dev/null
+++ b/kernel/sched/alt_core.c
@@ -0,0 +1,7682 @@
@@ -0,0 +1,7701 @@
+/*
+ * kernel/sched/alt_core.c
+ *
@ -677,7 +666,6 @@ index 000000000000..114bd1fd88eb
+#include <linux/kprobes.h>
+#include <linux/mmu_context.h>
+#include <linux/nmi.h>
+#include <linux/profile.h>
+#include <linux/rcupdate_wait.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
@ -716,7 +704,7 @@ index 000000000000..114bd1fd88eb
+#define sched_feat(x) (0)
+#endif /* CONFIG_SCHED_DEBUG */
+
+#define ALT_SCHED_VERSION "v5.15-r1"
+#define ALT_SCHED_VERSION "v5.16-r1"
+
+/* rt_prio(prio) defined in include/linux/sched/rt.h */
+#define rt_task(p) rt_prio((p)->prio)
@ -1355,6 +1343,25 @@ index 000000000000..114bd1fd88eb
+ return task_on_rq_queued(p);
+}
+
+unsigned long get_wchan(struct task_struct *p)
+{
+ unsigned long ip = 0;
+ unsigned int state;
+
+ if (!p || p == current)
+ return 0;
+
+ /* Only get wchan if task is blocked and we can keep it that way. */
+ raw_spin_lock_irq(&p->pi_lock);
+ state = READ_ONCE(p->__state);
+ smp_rmb(); /* see try_to_wake_up() */
+ if (state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq)
+ ip = __get_wchan(p);
+ raw_spin_unlock_irq(&p->pi_lock);
+
+ return ip;
+}
+
+/*
+ * Add/Remove/Requeue task to/from the runqueue routines
+ * Context: rq->lock
@ -1396,25 +1403,6 @@ index 000000000000..114bd1fd88eb
+ sched_update_tick_dependency(rq);
+}
+
+unsigned long get_wchan(struct task_struct *p)
+{
+ unsigned long ip = 0;
+ unsigned int state;
+
+ if (!p || p == current)
+ return 0;
+
+ /* Only get wchan if task is blocked and we can keep it that way. */
+ raw_spin_lock_irq(&p->pi_lock);
+ state = READ_ONCE(p->__state);
+ smp_rmb(); /* see try_to_wake_up() */
+ if (state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq)
+ ip = __get_wchan(p);
+ raw_spin_unlock_irq(&p->pi_lock);
+
+ return ip;
+}
+
+static inline void enqueue_task(struct task_struct *p, struct rq *rq, int flags)
+{
+ lockdep_assert_held(&rq->lock);
@ -1982,6 +1970,7 @@ index 000000000000..114bd1fd88eb
+ * per-task data have been completed by this moment.
+ */
+ smp_wmb();
+
+ WRITE_ONCE(task_thread_info(p)->cpu, cpu);
+#endif
+}
@ -2899,9 +2888,10 @@ index 000000000000..114bd1fd88eb
+ rq = this_rq();
+
+#ifdef CONFIG_SMP
+ if (cpu == rq->cpu)
+ if (cpu == rq->cpu) {
+ __schedstat_inc(rq->ttwu_local);
+ else {
+ __schedstat_inc(p->stats.nr_wakeups_local);
+ } else {
+ /** Alt schedule FW ToDo:
+ * How to do ttwu_wake_remote
+ */
@ -2909,6 +2899,7 @@ index 000000000000..114bd1fd88eb
+#endif /* CONFIG_SMP */
+
+ __schedstat_inc(rq->ttwu_count);
+ __schedstat_inc(p->stats.nr_wakeups);
+}
+
+/*
@ -3099,7 +3090,7 @@ index 000000000000..114bd1fd88eb
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ if (is_idle_task(rq->curr))
+ resched_curr(rq);
+ /* Else CPU is not idle, do nothing here: */
+ /* Else CPU is not idle, do nothing here */
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+out:
@ -3519,9 +3510,9 @@ index 000000000000..114bd1fd88eb
+
+ /*
+ * At this point the task is pinned; either:
+ * - blocked and we're holding off wakeups (pi->lock)
+ * - woken, and we're holding off enqueue (rq->lock)
+ * - queued, and we're holding off schedule (rq->lock)
+ * - blocked and we're holding off wakeups (pi->lock)
+ * - woken, and we're holding off enqueue (rq->lock)
+ * - queued, and we're holding off schedule (rq->lock)
+ * - running, and we're holding off de-schedule (rq->lock)
+ *
+ * The called function (@func) can use: task_curr(), p->on_rq and
@ -3572,6 +3563,11 @@ index 000000000000..114bd1fd88eb
+ p->stime = 0;
+ p->sched_time = 0;
+
+#ifdef CONFIG_SCHEDSTATS
+ /* Even if schedstat is disabled, there should not be garbage */
+ memset(&p->stats, 0, sizeof(p->stats));
+#endif
+
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+ INIT_HLIST_HEAD(&p->preempt_notifiers);
+#endif
@ -3637,11 +3633,8 @@ index 000000000000..114bd1fd88eb
+ struct rq *rq;
+
+ /*
+ * The child is not yet in the pid-hash so no cgroup attach races,
+ * and the cgroup is pinned to this child due to cgroup_fork()
+ * is ran before sched_fork().
+ *
+ * Silence PROVE_RCU.
+ * Because we're not yet on the pid-hash, p->pi_lock isn't strictly
+ * required yet, but lockdep gets upset if rules are violated.
+ */
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ /*
@ -3676,9 +3669,6 @@ index 000000000000..114bd1fd88eb
+
+void sched_post_fork(struct task_struct *p)
+{
+#ifdef CONFIG_UCLAMP_TASK
+ uclamp_post_fork(p);
+#endif
+}
+
+#ifdef CONFIG_SCHEDSTATS
@ -6859,9 +6849,7 @@ index 000000000000..114bd1fd88eb
+
+ if (spin_needbreak(lock) || resched) {
+ spin_unlock(lock);
+ if (resched)
+ preempt_schedule_common();
+ else
+ if (!_cond_resched())
+ cpu_relax();
+ ret = 1;
+ spin_lock(lock);
@ -6879,9 +6867,7 @@ index 000000000000..114bd1fd88eb
+
+ if (rwlock_needbreak(lock) || resched) {
+ read_unlock(lock);
+ if (resched)
+ preempt_schedule_common();
+ else
+ if (!_cond_resched())
+ cpu_relax();
+ ret = 1;
+ read_lock(lock);
@ -6899,9 +6885,7 @@ index 000000000000..114bd1fd88eb
+
+ if (rwlock_needbreak(lock) || resched) {
+ write_unlock(lock);
+ if (resched)
+ preempt_schedule_common();
+ else
+ if (!_cond_resched())
+ cpu_relax();
+ ret = 1;
+ write_lock(lock);
@ -7917,12 +7901,6 @@ index 000000000000..114bd1fd88eb
+}
+
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
+static inline int preempt_count_equals(int preempt_offset)
+{
+ int nested = preempt_count() + rcu_preempt_depth();
+
+ return (nested == preempt_offset);
+}
+
+void __might_sleep(const char *file, int line)
+{
@ -7942,7 +7920,28 @@ index 000000000000..114bd1fd88eb
+}
+EXPORT_SYMBOL(__might_sleep);
+
+void __might_resched(const char *file, int line, int preempt_offset)
+static void print_preempt_disable_ip(int preempt_offset, unsigned long ip)
+{
+ if (!IS_ENABLED(CONFIG_DEBUG_PREEMPT))
+ return;
+
+ if (preempt_count() == preempt_offset)
+ return;
+
+ pr_err("Preemption disabled at:");
+ print_ip_sym(KERN_ERR, ip);
+}
+
+static inline bool resched_offsets_ok(unsigned int offsets)
+{
+ unsigned int nested = preempt_count();
+
+ nested += rcu_preempt_depth() << MIGHT_RESCHED_RCU_SHIFT;
+
+ return nested == offsets;
+}
+
+void __might_resched(const char *file, int line, unsigned int offsets)
+{
+ /* Ratelimiting timestamp: */
+ static unsigned long prev_jiffy;
@ -7952,7 +7951,7 @@ index 000000000000..114bd1fd88eb
+ /* WARN_ON_ONCE() by default, no rate limit required: */
+ rcu_sleep_check();
+
+ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
+ if ((resched_offsets_ok(offsets) && !irqs_disabled() &&
+ !is_idle_task(current) && !current->non_block_count) ||
+ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
+ oops_in_progress)
@ -7969,6 +7968,13 @@ index 000000000000..114bd1fd88eb
+ pr_err("in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
+ in_atomic(), irqs_disabled(), current->non_block_count,
+ current->pid, current->comm);
+ pr_err("preempt_count: %x, expected: %x\n", preempt_count(),
+ offsets & MIGHT_RESCHED_PREEMPT_MASK);
+
+ if (IS_ENABLED(CONFIG_PREEMPT_RCU)) {
+ pr_err("RCU nest depth: %d, expected: %u\n",
+ rcu_preempt_depth(), offsets >> MIGHT_RESCHED_RCU_SHIFT);
+ }
+
+ if (task_stack_end_corrupted(current))
+ pr_emerg("Thread overran stack, or stack corrupted\n");
@ -7976,12 +7982,10 @@ index 000000000000..114bd1fd88eb
+ debug_show_held_locks(current);
+ if (irqs_disabled())
+ print_irqtrace_events(current);
+#ifdef CONFIG_DEBUG_PREEMPT
+ if (!preempt_count_equals(preempt_offset)) {
+ pr_err("Preemption disabled at:");
+ print_ip_sym(KERN_ERR, preempt_disable_ip);
+ }
+#endif
+
+ print_preempt_disable_ip(offsets & MIGHT_RESCHED_PREEMPT_MASK,
+ preempt_disable_ip);
+
+ dump_stack();
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
+}
@ -8068,6 +8072,10 @@ index 000000000000..114bd1fd88eb
+ if (p->flags & PF_KTHREAD)
+ continue;
+
+ schedstat_set(p->stats.wait_start, 0);
+ schedstat_set(p->stats.sleep_start, 0);
+ schedstat_set(p->stats.block_start, 0);
+
+ if (!rt_task(p)) {
+ /*
+ * Renice negative nice level userspace
@ -8139,9 +8147,9 @@ index 000000000000..114bd1fd88eb
+ kmem_cache_free(task_group_cache, tg);
+}
+
+static void sched_free_group_rcu(struct rcu_head *rcu)
+static void sched_free_group_rcu(struct rcu_head *rhp)
+{
+ sched_free_group(container_of(rcu, struct task_group, rcu));
+ sched_free_group(container_of(rhp, struct task_group, rcu));
+}
+
+static void sched_unregister_group(struct task_group *tg)
@ -8172,13 +8180,13 @@ index 000000000000..114bd1fd88eb
+/* rcu callback to free various structures associated with a task group */
+static void sched_unregister_group_rcu(struct rcu_head *rhp)
+{
+ /* Now it should be safe to free those cfs_rqs */
+ /* Now it should be safe to free those cfs_rqs: */
+ sched_unregister_group(container_of(rhp, struct task_group, rcu));
+}
+
+void sched_destroy_group(struct task_group *tg)
+{
+ /* Wait for possible concurrent references to cfs_rqs complete */
+ /* Wait for possible concurrent references to cfs_rqs complete: */
+ call_rcu(&tg->rcu, sched_unregister_group_rcu);
+}
+
@ -8368,10 +8376,10 @@ index 000000000000..1212a031700e
+{}
diff --git a/kernel/sched/alt_sched.h b/kernel/sched/alt_sched.h
new file mode 100644
index 000000000000..e78324687f6e
index 000000000000..f2b9e686d6a6
--- /dev/null
+++ b/kernel/sched/alt_sched.h
@@ -0,0 +1,661 @@
@@ -0,0 +1,667 @@
+#ifndef ALT_SCHED_H
+#define ALT_SCHED_H
+
@ -8405,6 +8413,7 @@ index 000000000000..e78324687f6e
+#include <linux/livepatch.h>
+#include <linux/membarrier.h>
+#include <linux/proc_fs.h>
+#include <linux/profile.h>
+#include <linux/psi.h>
+#include <linux/slab.h>
+#include <linux/stop_machine.h>
@ -8721,6 +8730,11 @@ index 000000000000..e78324687f6e
+}
+#endif
+
+static inline u64 __rq_clock_broken(struct rq *rq)
+{
+ return READ_ONCE(rq->clock);
+}
+
+static inline u64 rq_clock(struct rq *rq)
+{
+ /*
@ -9199,7 +9213,7 @@ index e7af18857371..3e38816b736e 100644
static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 9392aea1804e..c1ead972e498 100644
index b7ec42732b28..a855594a540f 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -123,7 +123,7 @@ void account_user_time(struct task_struct *p, u64 cputime)
@ -9495,7 +9509,7 @@ index a554e3bbab2b..3e56f5e6ff5c 100644
* thermal:
*
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index e06071bf3472..adf567df34d4 100644
index c336f5f481bc..5865f14714a9 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -1,13 +1,15 @@
@ -9515,15 +9529,15 @@ index e06071bf3472..adf567df34d4 100644
int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity);
static inline u64 thermal_load_avg(struct rq *rq)
@@ -42,6 +44,7 @@ static inline u32 get_pelt_divider(struct sched_avg *avg)
return LOAD_AVG_MAX - 1024 + avg->period_contrib;
@@ -44,6 +46,7 @@ static inline u32 get_pelt_divider(struct sched_avg *avg)
return PELT_MIN_DIVIDER + avg->period_contrib;
}
+#ifndef CONFIG_SCHED_ALT
static inline void cfs_se_util_change(struct sched_avg *avg)
{
unsigned int enqueued;
@@ -153,9 +156,11 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
@@ -155,9 +158,11 @@ static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
return rq_clock_pelt(rq_of(cfs_rq));
}
#endif
@ -9535,7 +9549,7 @@ index e06071bf3472..adf567df34d4 100644
static inline int
update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
@@ -173,6 +178,7 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
@@ -175,6 +180,7 @@ update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
{
return 0;
}
@ -9571,22 +9585,6 @@ diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 07dde2928c79..6a6edc730dce 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -4,6 +4,7 @@
*/
#include "sched.h"
+#ifndef CONFIG_SCHED_ALT
void __update_stats_wait_start(struct rq *rq, struct task_struct *p,
struct sched_statistics *stats)
{
@@ -90,6 +90,7 @@
}
}
+#endif
/*
* Current schedstat API version.
*
@@ -126,8 +126,10 @@ static int show_schedstat(struct seq_file *seq, void *v)
} else {
struct rq *rq;
@ -9615,27 +9613,27 @@ index 07dde2928c79..6a6edc730dce 100644
}
return 0;
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index cfb0893a83d4..4fb593535447 100644
index 3a3c826dd83a..d80520eca556 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -94,6 +94,7 @@ struct sched_entity_stats {
} __no_randomize_layout;
#endif
@@ -87,6 +87,7 @@ static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delt
#endif /* CONFIG_SCHEDSTATS */
+#ifndef CONFIG_SCHED_ALT
static inline struct sched_statistics *
__schedstats_from_se(struct sched_entity *se)
{
#ifdef CONFIG_FAIR_GROUP_SCHED
struct sched_entity_stats {
struct sched_entity se;
@@ -103,6 +104,7 @@ __schedstats_from_se(struct sched_entity *se)
#endif
return &task_of(se)->stats;
}
+#endif
+#endif /* CONFIG_SCHED_ALT */
#ifdef CONFIG_PSI
/*
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index d201a7052a29..163cec668095 100644
index d201a7052a29..e5a7a638f3fb 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -4,6 +4,7 @@
@ -9673,7 +9671,7 @@ index d201a7052a29..163cec668095 100644
#ifdef CONFIG_NUMA
static const struct cpumask *sd_numa_mask(int cpu)
@@ -2531,3 +2536,15 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
@@ -2531,3 +2536,17 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
mutex_unlock(&sched_domains_mutex);
}
@ -9683,6 +9681,8 @@ index d201a7052a29..163cec668095 100644
+{}
+
+#ifdef CONFIG_NUMA
+int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
+
+int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
+{
+ return best_mask_cpu(cpu, cpus);
@ -9690,7 +9690,7 @@ index d201a7052a29..163cec668095 100644
+#endif /* CONFIG_NUMA */
+#endif
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 083be6af29d7..09fc6281d488 100644
index 0586047f7323..e4bc1eacd184 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -122,6 +122,10 @@ static unsigned long long_max = LONG_MAX;
@ -9704,7 +9704,7 @@ index 083be6af29d7..09fc6281d488 100644
#ifdef CONFIG_PRINTK
static int ten_thousand = 10000;
#endif
@@ -1771,6 +1775,24 @@ int proc_do_static_key(struct ctl_table *table, int write,
@@ -1778,6 +1782,24 @@ int proc_do_static_key(struct ctl_table *table, int write,
}
static struct ctl_table kern_table[] = {
@ -9729,7 +9729,7 @@ index 083be6af29d7..09fc6281d488 100644
{
.procname = "sched_child_runs_first",
.data = &sysctl_sched_child_runs_first,
@@ -1901,6 +1923,7 @@ static struct ctl_table kern_table[] = {
@@ -1908,6 +1930,7 @@ static struct ctl_table kern_table[] = {
.extra2 = SYSCTL_ONE,
},
#endif
@ -9737,7 +9737,7 @@ index 083be6af29d7..09fc6281d488 100644
#ifdef CONFIG_PROVE_LOCKING
{
.procname = "prove_locking",
@@ -2477,6 +2500,17 @@ static struct ctl_table kern_table[] = {
@@ -2484,6 +2507,17 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
@ -9831,10 +9831,10 @@ index 96b4e7810426..83457e8bb5d2 100644
return false;
}
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index afd937a46496..7fac2e43d668 100644
index abcadbe933bb..d4c778b0ab0e 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -1142,10 +1142,15 @@ static int trace_wakeup_test_thread(void *data)
@@ -1140,10 +1140,15 @@ static int trace_wakeup_test_thread(void *data)
{
/* Make this a -deadline thread */
static const struct sched_attr attr = {

View File

@ -1,5 +1,5 @@
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index f5a27f067db9..90c934ec13cc 100644
index 7123524a86b8..c9878f85c176 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5085,6 +5085,12 @@
@ -176,34 +176,23 @@ index 8874f681b056..59eb72bf7d5f 100644
[RLIMIT_RTTIME] = { RLIM_INFINITY, RLIM_INFINITY }, \
}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 75ba8aa60248..3de388cb6923 100644
index 75ba8aa60248..6da339d69619 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -753,8 +753,14 @@ struct task_struct {
@@ -753,7 +753,12 @@ struct task_struct {
unsigned int ptrace;
#ifdef CONFIG_SMP
- int on_cpu;
struct __call_single_node wake_entry;
+ struct __call_single_node wake_entry;
+#endif
+#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_ALT)
+ int on_cpu;
int on_cpu;
+#endif
+
+#ifdef CONFIG_SMP
+#ifndef CONFIG_SCHED_ALT
+#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_ALT)
struct __call_single_node wake_entry;
unsigned int wakee_flips;
unsigned long wakee_flip_decay_ts;
struct task_struct *last_wakee;
@@ -768,6 +774,7 @@ struct task_struct {
*/
int recent_used_cpu;
int wake_cpu;
+#endif /* !CONFIG_SCHED_ALT */
#endif
int on_rq;
@@ -776,6 +783,20 @@ struct task_struct {
@@ -776,6 +781,20 @@ struct task_struct {
int normal_prio;
unsigned int rt_priority;
@ -224,7 +213,7 @@ index 75ba8aa60248..3de388cb6923 100644
struct sched_entity se;
struct sched_rt_entity rt;
struct sched_dl_entity dl;
@@ -786,6 +807,7 @@ struct task_struct {
@@ -786,6 +805,7 @@ struct task_struct {
unsigned long core_cookie;
unsigned int core_occupation;
#endif
@ -232,7 +221,7 @@ index 75ba8aa60248..3de388cb6923 100644
#ifdef CONFIG_CGROUP_SCHED
struct task_group *sched_task_group;
@@ -1509,6 +1531,15 @@ struct task_struct {
@@ -1509,6 +1529,15 @@ struct task_struct {
*/
};
@ -352,20 +341,12 @@ index 8054641c0a7b..284687d47059 100644
#else
static inline void rebuild_sched_domains_energy(void)
diff --git a/init/Kconfig b/init/Kconfig
index e9119bf54b1f..2213c306065e 100644
index e9119bf54b1f..6be3308a3665 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -817,6 +817,7 @@ menu "Scheduler features"
config UCLAMP_TASK
bool "Enable utilization clamping for RT/FAIR tasks"
depends on CPU_FREQ_GOV_SCHEDUTIL
+ depends on !SCHED_ALT
help
This feature enables the scheduler to track the clamped utilization
of each CPU based on RUNNABLE tasks scheduled on that CPU.
@@ -863,6 +864,35 @@ config UCLAMP_BUCKETS_COUNT
@@ -814,9 +814,39 @@ config GENERIC_SCHED_CLOCK
If in doubt, use the default value.
menu "Scheduler features"
+menuconfig SCHED_ALT
+ bool "Alternative CPU Schedulers"
@ -396,9 +377,13 @@ index e9119bf54b1f..2213c306065e 100644
+
+endif
+
endmenu
#
config UCLAMP_TASK
bool "Enable utilization clamping for RT/FAIR tasks"
depends on CPU_FREQ_GOV_SCHEDUTIL
+ depends on !SCHED_ALT
help
This feature enables the scheduler to track the clamped utilization
of each CPU based on RUNNABLE tasks scheduled on that CPU.
@@ -907,6 +937,7 @@ config NUMA_BALANCING
depends on ARCH_SUPPORTS_NUMA_BALANCING
depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY
@ -647,10 +632,10 @@ index c83b37af155b..c88e9aab0cb3 100644
obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
diff --git a/kernel/sched/alt_core.c b/kernel/sched/alt_core.c
new file mode 100644
index 000000000000..c52650a6e72e
index 000000000000..6338a97b429e
--- /dev/null
+++ b/kernel/sched/alt_core.c
@@ -0,0 +1,7680 @@
@@ -0,0 +1,7704 @@
+/*
+ * kernel/sched/alt_core.c
+ *
@ -681,7 +666,6 @@ index 000000000000..c52650a6e72e
+#include <linux/kprobes.h>
+#include <linux/mmu_context.h>
+#include <linux/nmi.h>
+#include <linux/profile.h>
+#include <linux/rcupdate_wait.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
@ -720,7 +704,7 @@ index 000000000000..c52650a6e72e
+#define sched_feat(x) (0)
+#endif /* CONFIG_SCHED_DEBUG */
+
+#define ALT_SCHED_VERSION "v5.15-r1"
+#define ALT_SCHED_VERSION "v5.17-r0"
+
+/* rt_prio(prio) defined in include/linux/sched/rt.h */
+#define rt_task(p) rt_prio((p)->prio)
@ -1359,6 +1343,25 @@ index 000000000000..c52650a6e72e
+ return task_on_rq_queued(p);
+}
+
+unsigned long get_wchan(struct task_struct *p)
+{
+ unsigned long ip = 0;
+ unsigned int state;
+
+ if (!p || p == current)
+ return 0;
+
+ /* Only get wchan if task is blocked and we can keep it that way. */
+ raw_spin_lock_irq(&p->pi_lock);
+ state = READ_ONCE(p->__state);
+ smp_rmb(); /* see try_to_wake_up() */
+ if (state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq)
+ ip = __get_wchan(p);
+ raw_spin_unlock_irq(&p->pi_lock);
+
+ return ip;
+}
+
+/*
+ * Add/Remove/Requeue task to/from the runqueue routines
+ * Context: rq->lock
@ -1400,25 +1403,6 @@ index 000000000000..c52650a6e72e
+ sched_update_tick_dependency(rq);
+}
+
+unsigned long get_wchan(struct task_struct *p)
+{
+ unsigned long ip = 0;
+ unsigned int state;
+
+ if (!p || p == current)
+ return 0;
+
+ /* Only get wchan if task is blocked and we can keep it that way. */
+ raw_spin_lock_irq(&p->pi_lock);
+ state = READ_ONCE(p->__state);
+ smp_rmb(); /* see try_to_wake_up() */
+ if (state != TASK_RUNNING && state != TASK_WAKING && !p->on_rq)
+ ip = __get_wchan(p);
+ raw_spin_unlock_irq(&p->pi_lock);
+
+ return ip;
+}
+
+static inline void enqueue_task(struct task_struct *p, struct rq *rq, int flags)
+{
+ lockdep_assert_held(&rq->lock);
@ -1986,6 +1970,7 @@ index 000000000000..c52650a6e72e
+ * per-task data have been completed by this moment.
+ */
+ smp_wmb();
+
+ WRITE_ONCE(task_thread_info(p)->cpu, cpu);
+#endif
+}
@ -2094,6 +2079,9 @@ index 000000000000..c52650a6e72e
+{
+ struct task_struct *p = current;
+
+ if (0 == p->migration_disabled)
+ return;
+
+ if (p->migration_disabled > 1) {
+ p->migration_disabled--;
+ return;
@ -2903,9 +2891,10 @@ index 000000000000..c52650a6e72e
+ rq = this_rq();
+
+#ifdef CONFIG_SMP
+ if (cpu == rq->cpu)
+ if (cpu == rq->cpu) {
+ __schedstat_inc(rq->ttwu_local);
+ else {
+ __schedstat_inc(p->stats.nr_wakeups_local);
+ } else {
+ /** Alt schedule FW ToDo:
+ * How to do ttwu_wake_remote
+ */
@ -2913,6 +2902,7 @@ index 000000000000..c52650a6e72e
+#endif /* CONFIG_SMP */
+
+ __schedstat_inc(rq->ttwu_count);
+ __schedstat_inc(p->stats.nr_wakeups);
+}
+
+/*
@ -3103,7 +3093,7 @@ index 000000000000..c52650a6e72e
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ if (is_idle_task(rq->curr))
+ resched_curr(rq);
+ /* Else CPU is not idle, do nothing here: */
+ /* Else CPU is not idle, do nothing here */
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+out:
@ -3523,9 +3513,9 @@ index 000000000000..c52650a6e72e
+
+ /*
+ * At this point the task is pinned; either:
+ * - blocked and we're holding off wakeups (pi->lock)
+ * - woken, and we're holding off enqueue (rq->lock)
+ * - queued, and we're holding off schedule (rq->lock)
+ * - blocked and we're holding off wakeups (pi->lock)
+ * - woken, and we're holding off enqueue (rq->lock)
+ * - queued, and we're holding off schedule (rq->lock)
+ * - running, and we're holding off de-schedule (rq->lock)
+ *
+ * The called function (@func) can use: task_curr(), p->on_rq and
@ -3576,6 +3566,11 @@ index 000000000000..c52650a6e72e
+ p->stime = 0;
+ p->sched_time = 0;
+
+#ifdef CONFIG_SCHEDSTATS
+ /* Even if schedstat is disabled, there should not be garbage */
+ memset(&p->stats, 0, sizeof(p->stats));
+#endif
+
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+ INIT_HLIST_HEAD(&p->preempt_notifiers);
+#endif
@ -3630,9 +3625,6 @@ index 000000000000..c52650a6e72e
+ if (unlikely(sched_info_on()))
+ memset(&p->sched_info, 0, sizeof(p->sched_info));
+#endif
+#if defined(CONFIG_SMP)
+ p->on_cpu = 0;
+#endif
+ init_task_preempt_count(p);
+
+ return 0;
@ -3644,11 +3636,8 @@ index 000000000000..c52650a6e72e
+ struct rq *rq;
+
+ /*
+ * The child is not yet in the pid-hash so no cgroup attach races,
+ * and the cgroup is pinned to this child due to cgroup_fork()
+ * is ran before sched_fork().
+ *
+ * Silence PROVE_RCU.
+ * Because we're not yet on the pid-hash, p->pi_lock isn't strictly
+ * required yet, but lockdep gets upset if rules are violated.
+ */
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ /*
@ -3683,9 +3672,6 @@ index 000000000000..c52650a6e72e
+
+void sched_post_fork(struct task_struct *p)
+{
+#ifdef CONFIG_UCLAMP_TASK
+ uclamp_post_fork(p);
+#endif
+}
+
+#ifdef CONFIG_SCHEDSTATS
@ -7266,7 +7252,6 @@ index 000000000000..c52650a6e72e
+
+ rq->idle = idle;
+ rcu_assign_pointer(rq->curr, idle);
+ idle->on_rq = TASK_ON_RQ_QUEUED;
+ idle->on_cpu = 1;
+
+ raw_spin_unlock(&rq->lock);
@ -7919,12 +7904,6 @@ index 000000000000..c52650a6e72e
+}
+
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
+static inline int preempt_count_equals(int preempt_offset)
+{
+ int nested = preempt_count() + rcu_preempt_depth();
+
+ return (nested == preempt_offset);
+}
+
+void __might_sleep(const char *file, int line)
+{
@ -7944,7 +7923,28 @@ index 000000000000..c52650a6e72e
+}
+EXPORT_SYMBOL(__might_sleep);
+
+void __might_resched(const char *file, int line, int preempt_offset)
+static void print_preempt_disable_ip(int preempt_offset, unsigned long ip)
+{
+ if (!IS_ENABLED(CONFIG_DEBUG_PREEMPT))
+ return;
+
+ if (preempt_count() == preempt_offset)
+ return;
+
+ pr_err("Preemption disabled at:");
+ print_ip_sym(KERN_ERR, ip);
+}
+
+static inline bool resched_offsets_ok(unsigned int offsets)
+{
+ unsigned int nested = preempt_count();
+
+ nested += rcu_preempt_depth() << MIGHT_RESCHED_RCU_SHIFT;
+
+ return nested == offsets;
+}
+
+void __might_resched(const char *file, int line, unsigned int offsets)
+{
+ /* Ratelimiting timestamp: */
+ static unsigned long prev_jiffy;
@ -7954,7 +7954,7 @@ index 000000000000..c52650a6e72e
+ /* WARN_ON_ONCE() by default, no rate limit required: */
+ rcu_sleep_check();
+
+ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
+ if ((resched_offsets_ok(offsets) && !irqs_disabled() &&
+ !is_idle_task(current) && !current->non_block_count) ||
+ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
+ oops_in_progress)
@ -7971,6 +7971,13 @@ index 000000000000..c52650a6e72e
+ pr_err("in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n",
+ in_atomic(), irqs_disabled(), current->non_block_count,
+ current->pid, current->comm);
+ pr_err("preempt_count: %x, expected: %x\n", preempt_count(),
+ offsets & MIGHT_RESCHED_PREEMPT_MASK);
+
+ if (IS_ENABLED(CONFIG_PREEMPT_RCU)) {
+ pr_err("RCU nest depth: %d, expected: %u\n",
+ rcu_preempt_depth(), offsets >> MIGHT_RESCHED_RCU_SHIFT);
+ }
+
+ if (task_stack_end_corrupted(current))
+ pr_emerg("Thread overran stack, or stack corrupted\n");
@ -7978,12 +7985,10 @@ index 000000000000..c52650a6e72e
+ debug_show_held_locks(current);
+ if (irqs_disabled())
+ print_irqtrace_events(current);
+#ifdef CONFIG_DEBUG_PREEMPT
+ if (!preempt_count_equals(preempt_offset)) {
+ pr_err("Preemption disabled at:");
+ print_ip_sym(KERN_ERR, preempt_disable_ip);
+ }
+#endif
+
+ print_preempt_disable_ip(offsets & MIGHT_RESCHED_PREEMPT_MASK,
+ preempt_disable_ip);
+
+ dump_stack();
+ add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
+}
@ -8070,6 +8075,10 @@ index 000000000000..c52650a6e72e
+ if (p->flags & PF_KTHREAD)
+ continue;
+
+ schedstat_set(p->stats.wait_start, 0);
+ schedstat_set(p->stats.sleep_start, 0);
+ schedstat_set(p->stats.block_start, 0);
+
+ if (!rt_task(p)) {
+ /*
+ * Renice negative nice level userspace
@ -8141,9 +8150,9 @@ index 000000000000..c52650a6e72e
+ kmem_cache_free(task_group_cache, tg);
+}
+
+static void sched_free_group_rcu(struct rcu_head *rcu)
+static void sched_free_group_rcu(struct rcu_head *rhp)
+{
+ sched_free_group(container_of(rcu, struct task_group, rcu));
+ sched_free_group(container_of(rhp, struct task_group, rcu));
+}
+
+static void sched_unregister_group(struct task_group *tg)
@ -8174,13 +8183,13 @@ index 000000000000..c52650a6e72e
+/* rcu callback to free various structures associated with a task group */
+static void sched_unregister_group_rcu(struct rcu_head *rhp)
+{
+ /* Now it should be safe to free those cfs_rqs */
+ /* Now it should be safe to free those cfs_rqs: */
+ sched_unregister_group(container_of(rhp, struct task_group, rcu));
+}
+
+void sched_destroy_group(struct task_group *tg)
+{
+ /* Wait for possible concurrent references to cfs_rqs complete */
+ /* Wait for possible concurrent references to cfs_rqs complete: */
+ call_rcu(&tg->rcu, sched_unregister_group_rcu);
+}
+
@ -8370,10 +8379,10 @@ index 000000000000..1212a031700e
+{}
diff --git a/kernel/sched/alt_sched.h b/kernel/sched/alt_sched.h
new file mode 100644
index 000000000000..6ff979a299ab
index 000000000000..f2b9e686d6a6
--- /dev/null
+++ b/kernel/sched/alt_sched.h
@@ -0,0 +1,662 @@
@@ -0,0 +1,667 @@
+#ifndef ALT_SCHED_H
+#define ALT_SCHED_H
+
@ -8724,6 +8733,11 @@ index 000000000000..6ff979a299ab
+}
+#endif
+
+static inline u64 __rq_clock_broken(struct rq *rq)
+{
+ return READ_ONCE(rq->clock);
+}
+
+static inline u64 rq_clock(struct rq *rq)
+{
+ /*
@ -9602,27 +9616,27 @@ index 07dde2928c79..6a6edc730dce 100644
}
return 0;
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 3a3c826dd83a..39df2b235944 100644
index 3a3c826dd83a..d80520eca556 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -94,6 +94,7 @@ struct sched_entity_stats {
} __no_randomize_layout;
#endif
@@ -87,6 +87,7 @@ static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delt
#endif /* CONFIG_SCHEDSTATS */
+#ifndef CONFIG_SCHED_ALT
static inline struct sched_statistics *
__schedstats_from_se(struct sched_entity *se)
{
#ifdef CONFIG_FAIR_GROUP_SCHED
struct sched_entity_stats {
struct sched_entity se;
@@ -103,6 +104,7 @@ __schedstats_from_se(struct sched_entity *se)
#endif
return &task_of(se)->stats;
}
+#endif
+#endif /* CONFIG_SCHED_ALT */
#ifdef CONFIG_PSI
/*
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index d201a7052a29..163cec668095 100644
index d201a7052a29..e5a7a638f3fb 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -4,6 +4,7 @@
@ -9660,7 +9674,7 @@ index d201a7052a29..163cec668095 100644
#ifdef CONFIG_NUMA
static const struct cpumask *sd_numa_mask(int cpu)
@@ -2531,3 +2536,15 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
@@ -2531,3 +2536,17 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
mutex_unlock(&sched_domains_mutex);
}
@ -9670,6 +9684,8 @@ index d201a7052a29..163cec668095 100644
+{}
+
+#ifdef CONFIG_NUMA
+int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
+
+int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
+{
+ return best_mask_cpu(cpu, cpus);
@ -9677,21 +9693,21 @@ index d201a7052a29..163cec668095 100644
+#endif /* CONFIG_NUMA */
+#endif
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 5ae443b2882e..7bb4e033cae6 100644
index 730ab56d9e92..f2fdf9088055 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -94,6 +94,10 @@
@@ -96,6 +96,10 @@
#if defined(CONFIG_SYSCTL)
/* Constants used for minimum and maximum */
+#ifdef CONFIG_SCHED_ALT
+extern int sched_yield_type;
+#endif
+
#ifdef CONFIG_USER_NS
extern int unprivileged_userns_clone;
#ifdef CONFIG_PERF_EVENTS
static const int six_hundred_forty_kb = 640 * 1024;
#endif
@@ -1652,6 +1656,24 @@ int proc_do_static_key(struct ctl_table *table, int write,
@@ -1659,6 +1663,24 @@ int proc_do_static_key(struct ctl_table *table, int write,
}
static struct ctl_table kern_table[] = {
@ -9716,7 +9732,7 @@ index 5ae443b2882e..7bb4e033cae6 100644
{
.procname = "sched_child_runs_first",
.data = &sysctl_sched_child_runs_first,
@@ -1782,6 +1804,7 @@ static struct ctl_table kern_table[] = {
@@ -1789,6 +1811,7 @@ static struct ctl_table kern_table[] = {
.extra2 = SYSCTL_ONE,
},
#endif
@ -9724,7 +9740,7 @@ index 5ae443b2882e..7bb4e033cae6 100644
#ifdef CONFIG_PROVE_LOCKING
{
.procname = "prove_locking",
@@ -2167,6 +2190,17 @@ static struct ctl_table kern_table[] = {
@@ -2174,6 +2197,17 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
@ -9837,3 +9853,92 @@ index abcadbe933bb..d4c778b0ab0e 100644
};
struct wakeup_test_data *x = data;
diff --git a/init/Kconfig b/init/Kconfig
index 6be3308a3665af9e932db6639e4e22adec1dd9c9..2213c306065ea9e46061da4ad3c901183ee13f78 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -814,35 +814,6 @@ config GENERIC_SCHED_CLOCK
menu "Scheduler features"
-menuconfig SCHED_ALT
- bool "Alternative CPU Schedulers"
- default y
- help
- This feature enable alternative CPU scheduler"
-
-if SCHED_ALT
-
-choice
- prompt "Alternative CPU Scheduler"
- default SCHED_BMQ
-
-config SCHED_BMQ
- bool "BMQ CPU scheduler"
- help
- The BitMap Queue CPU scheduler for excellent interactivity and
- responsiveness on the desktop and solid scalability on normal
- hardware and commodity servers.
-
-config SCHED_PDS
- bool "PDS CPU scheduler"
- help
- The Priority and Deadline based Skip list multiple queue CPU
- Scheduler.
-
-endchoice
-
-endif
-
config UCLAMP_TASK
bool "Enable utilization clamping for RT/FAIR tasks"
depends on CPU_FREQ_GOV_SCHEDUTIL
@@ -893,6 +864,35 @@ config UCLAMP_BUCKETS_COUNT
If in doubt, use the default value.
+menuconfig SCHED_ALT
+ bool "Alternative CPU Schedulers"
+ default y
+ help
+ This feature enable alternative CPU scheduler"
+
+if SCHED_ALT
+
+choice
+ prompt "Alternative CPU Scheduler"
+ default SCHED_BMQ
+
+config SCHED_BMQ
+ bool "BMQ CPU scheduler"
+ help
+ The BitMap Queue CPU scheduler for excellent interactivity and
+ responsiveness on the desktop and solid scalability on normal
+ hardware and commodity servers.
+
+config SCHED_PDS
+ bool "PDS CPU scheduler"
+ help
+ The Priority and Deadline based Skip list multiple queue CPU
+ Scheduler.
+
+endchoice
+
+endif
+
endmenu
#
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index e5a7a638f3fbe0f40f9b0be709ccbd02039d6d3b..163cec668095133a3f1d34df462cc7b8260504be 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2542,8 +2542,6 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
{}
#ifdef CONFIG_NUMA
-int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
-
int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
{
return best_mask_cpu(cpu, cpus);

View File

@ -0,0 +1,156 @@
From 5ec2dd3a095442ec1a21d86042a4994f2ba24e63 Mon Sep 17 00:00:00 2001
Message-Id: <5ec2dd3a095442ec1a21d86042a4994f2ba24e63.1512651251.git.jan.steffens@gmail.com>
From: Serge Hallyn <serge.hallyn@canonical.com>
Date: Fri, 31 May 2013 19:12:12 +0100
Subject: [PATCH] add sysctl to disallow unprivileged CLONE_NEWUSER by default
Signed-off-by: Serge Hallyn <serge.hallyn@ubuntu.com>
[bwh: Remove unneeded binary sysctl bits]
Signed-off-by: Daniel Micay <danielmicay@gmail.com>
---
kernel/fork.c | 15 +++++++++++++++
kernel/sysctl.c | 12 ++++++++++++
kernel/user_namespace.c | 3 +++
3 files changed, 30 insertions(+)
diff --git a/kernel/fork.c b/kernel/fork.c
index 07cc743698d3668e..4011d68a8ff9305c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -102,6 +102,11 @@
#define CREATE_TRACE_POINTS
#include <trace/events/task.h>
+#ifdef CONFIG_USER_NS
+extern int unprivileged_userns_clone;
+#else
+#define unprivileged_userns_clone 0
+#endif
/*
* Minimum number of threads to boot the kernel
@@ -1555,6 +1560,10 @@ static __latent_entropy struct task_struct *copy_process(
if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
return ERR_PTR(-EINVAL);
+ if ((clone_flags & CLONE_NEWUSER) && !unprivileged_userns_clone)
+ if (!capable(CAP_SYS_ADMIN))
+ return ERR_PTR(-EPERM);
+
/*
* Thread groups must share signals as well, and detached threads
* can only be started up within the thread group.
@@ -2348,6 +2357,12 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
if (unshare_flags & CLONE_NEWNS)
unshare_flags |= CLONE_FS;
+ if ((unshare_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) {
+ err = -EPERM;
+ if (!capable(CAP_SYS_ADMIN))
+ goto bad_unshare_out;
+ }
+
err = check_unshare_flags(unshare_flags);
if (err)
goto bad_unshare_out;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b86520ed3fb60fbf..f7dab3760839f1a1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -105,6 +105,9 @@ extern int core_uses_pid;
#if defined(CONFIG_SYSCTL)
+#ifdef CONFIG_USER_NS
+extern int unprivileged_userns_clone;
+#endif
/* Constants used for minimum and maximum */
#ifdef CONFIG_LOCKUP_DETECTOR
static int sixty = 60;
@@ -513,6 +516,15 @@ static struct ctl_table kern_table[] = {
.proc_handler = proc_dointvec,
},
#endif
+#ifdef CONFIG_USER_NS
+ {
+ .procname = "unprivileged_userns_clone",
+ .data = &unprivileged_userns_clone,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
#ifdef CONFIG_PROC_SYSCTL
{
.procname = "tainted",
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index c490f1e4313b998a..dd03bd39d7bf194d 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -24,6 +24,9 @@
#include <linux/projid.h>
#include <linux/fs_struct.h>
+/* sysctl */
+int unprivileged_userns_clone;
+
static struct kmem_cache *user_ns_cachep __read_mostly;
static DEFINE_MUTEX(userns_state_mutex);
--
2.15.1
From b5202296055dd333db4425120d3f93ef4e6a0573 Mon Sep 17 00:00:00 2001
From: "Jan Alexander Steffens (heftig)" <jan.steffens@gmail.com>
Date: Thu, 7 Dec 2017 13:50:48 +0100
Subject: ZEN: Add CONFIG for unprivileged_userns_clone
This way our default behavior continues to match the vanilla kernel.
---
init/Kconfig | 16 ++++++++++++++++
kernel/user_namespace.c | 4 ++++
2 files changed, 20 insertions(+)
diff --git a/init/Kconfig b/init/Kconfig
index 4592bf7997c0..f3df02990aff 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1004,6 +1004,22 @@ config USER_NS
If unsure, say N.
+config USER_NS_UNPRIVILEGED
+ bool "Allow unprivileged users to create namespaces"
+ default y
+ depends on USER_NS
+ help
+ When disabled, unprivileged users will not be able to create
+ new namespaces. Allowing users to create their own namespaces
+ has been part of several recent local privilege escalation
+ exploits, so if you need user namespaces but are
+ paranoid^Wsecurity-conscious you want to disable this.
+
+ This setting can be overridden at runtime via the
+ kernel.unprivileged_userns_clone sysctl.
+
+ If unsure, say Y.
+
config PID_NS
bool "PID Namespaces"
default y
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 6b9dbc257e34..107b17f0d528 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -27,7 +27,11 @@
#include <linux/sort.h>
/* sysctl */
+#ifdef CONFIG_USER_NS_UNPRIVILEGED
+int unprivileged_userns_clone = 1;
+#else
int unprivileged_userns_clone;
+#endif
static struct kmem_cache *user_ns_cachep __read_mostly;
static DEFINE_MUTEX(userns_state_mutex);
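Taken together, the two patches above gate unprivileged `CLONE_NEWUSER` behind a knob that can be flipped at runtime, for example:

```bash
# Check the current policy (0 = unprivileged user namespaces disallowed)
sysctl kernel.unprivileged_userns_clone

# Allow unprivileged user namespace creation until reboot (root required)
sysctl -w kernel.unprivileged_userns_clone=1
```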

View File

@ -0,0 +1,245 @@
From 5ae86c8436b83762bc6cf46bea1da6ace2d3f50e Mon Sep 17 00:00:00 2001
From: Paul Gofman <pgofman@codeweavers.com>
Date: Wed, 6 May 2020 14:37:44 +0300
Subject: [PATCH 1/2] mm: Support soft dirty flag reset for VA range.
---
fs/proc/task_mmu.c | 129 ++++++++++++++++++++++++++++++++++++---------
1 file changed, 103 insertions(+), 26 deletions(-)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 3cec6fbef725..7c7865028f10 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1032,6 +1032,8 @@ enum clear_refs_types {
struct clear_refs_private {
enum clear_refs_types type;
+ unsigned long start, end;
+ bool clear_range;
};
#ifdef CONFIG_MEM_SOFT_DIRTY
@@ -1125,6 +1127,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
spinlock_t *ptl;
struct page *page;
+ BUG_ON(addr < cp->start || end > cp->end);
+
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
@@ -1181,9 +1185,11 @@ static int clear_refs_test_walk(unsigned long start, unsigned long end,
struct clear_refs_private *cp = walk->private;
struct vm_area_struct *vma = walk->vma;
- if (vma->vm_flags & VM_PFNMAP)
+ if (!cp->clear_range && (vma->vm_flags & VM_PFNMAP))
return 1;
+ BUG_ON(start < cp->start || end > cp->end);
+
/*
* Writing 1 to /proc/pid/clear_refs affects all pages.
* Writing 2 to /proc/pid/clear_refs only affects anonymous pages.
@@ -1206,10 +1212,12 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
struct task_struct *task;
- char buffer[PROC_NUMBUF];
+ char buffer[18];
struct mm_struct *mm;
struct vm_area_struct *vma;
enum clear_refs_types type;
+ unsigned long start, end;
+ bool clear_range;
int itype;
int rv;
@@ -1218,12 +1226,34 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count))
return -EFAULT;
- rv = kstrtoint(strstrip(buffer), 10, &itype);
- if (rv < 0)
- return rv;
- type = (enum clear_refs_types)itype;
- if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
- return -EINVAL;
+
+ if (buffer[0] == '6')
+ {
+ static int once;
+
+ if (!once++)
+ printk(KERN_DEBUG "task_mmu: Using POC clear refs range implementation.\n");
+
+ if (count != 17)
+ return -EINVAL;
+
+ type = CLEAR_REFS_SOFT_DIRTY;
+ start = *(unsigned long *)(buffer + 1);
+ end = *(unsigned long *)(buffer + 1 + 8);
+ }
+ else
+ {
+ rv = kstrtoint(strstrip(buffer), 10, &itype);
+ if (rv < 0)
+ return rv;
+ type = (enum clear_refs_types)itype;
+
+ if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
+ return -EINVAL;
+
+ start = 0;
+ end = -1UL;
+ }
task = get_proc_task(file_inode(file));
if (!task)
@@ -1235,41 +1265,87 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
.type = type,
};
- if (mmap_write_lock_killable(mm)) {
- count = -EINTR;
- goto out_mm;
+ if (start || end != -1UL)
+ {
+ start = min(start, mm->highest_vm_end) & PAGE_MASK;
+ end = min(end, mm->highest_vm_end) & PAGE_MASK;
+
+ if (start >= end)
+ {
+ count = -EINVAL;
+ goto out_mm;
+ }
+ clear_range = true;
}
+ else
+ {
+ clear_range = false;
+ }
+
+ cp.start = start;
+ cp.end = end;
+ cp.clear_range = clear_range;
+
if (type == CLEAR_REFS_MM_HIWATER_RSS) {
+ if (mmap_write_lock_killable(mm)) {
+ count = -EINTR;
+ goto out_mm;
+ }
+
/*
* Writing 5 to /proc/pid/clear_refs resets the peak
* resident set size to this mm's current rss value.
*/
reset_mm_hiwater_rss(mm);
- goto out_unlock;
+ mmap_write_unlock(mm);
+ goto out_mm;
}
if (type == CLEAR_REFS_SOFT_DIRTY) {
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- if (!(vma->vm_flags & VM_SOFTDIRTY))
- continue;
- vma->vm_flags &= ~VM_SOFTDIRTY;
- vma_set_page_prot(vma);
+ if (mmap_read_lock_killable(mm)) {
+ count = -EINTR;
+ goto out_mm;
}
-
+ if (!clear_range)
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ if (!(vma->vm_flags & VM_SOFTDIRTY))
+ continue;
+ mmap_read_unlock(mm);
+ if (mmap_write_lock_killable(mm)) {
+ count = -EINTR;
+ goto out_mm;
+ }
+ for (vma = mm->mmap; vma; vma = vma->vm_next) {
+ vma->vm_flags &= ~VM_SOFTDIRTY;
+ vma_set_page_prot(vma);
+ }
+ mmap_write_downgrade(mm);
+ break;
+ }
inc_tlb_flush_pending(mm);
mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY,
- 0, NULL, mm, 0, -1UL);
+ 0, NULL, mm, start, end);
mmu_notifier_invalidate_range_start(&range);
}
- walk_page_range(mm, 0, mm->highest_vm_end, &clear_refs_walk_ops,
+ else
+ {
+ if (mmap_write_lock_killable(mm)) {
+ count = -EINTR;
+ goto out_mm;
+ }
+ }
+ walk_page_range(mm, start, end == -1UL ? mm->highest_vm_end : end, &clear_refs_walk_ops,
&cp);
if (type == CLEAR_REFS_SOFT_DIRTY) {
mmu_notifier_invalidate_range_end(&range);
flush_tlb_mm(mm);
dec_tlb_flush_pending(mm);
+ mmap_read_unlock(mm);
+ }
+ else
+ {
+ mmap_write_unlock(mm);
}
-out_unlock:
- mmap_write_unlock(mm);
out_mm:
mmput(mm);
}
@@ -1301,6 +1377,7 @@ struct pagemapread {
#define PM_PFRAME_MASK GENMASK_ULL(PM_PFRAME_BITS - 1, 0)
#define PM_SOFT_DIRTY BIT_ULL(55)
#define PM_MMAP_EXCLUSIVE BIT_ULL(56)
+#define PM_SOFT_DIRTY_PAGE BIT_ULL(57)
#define PM_UFFD_WP BIT_ULL(57)
#define PM_FILE BIT_ULL(61)
#define PM_SWAP BIT_ULL(62)
@@ -1373,13 +1450,13 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
flags |= PM_PRESENT;
page = vm_normal_page(vma, addr, pte);
if (pte_soft_dirty(pte))
- flags |= PM_SOFT_DIRTY;
+ flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE;
if (pte_uffd_wp(pte))
flags |= PM_UFFD_WP;
} else if (is_swap_pte(pte)) {
swp_entry_t entry;
if (pte_swp_soft_dirty(pte))
- flags |= PM_SOFT_DIRTY;
+ flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE;
if (pte_swp_uffd_wp(pte))
flags |= PM_UFFD_WP;
entry = pte_to_swp_entry(pte);
@@ -1500,7 +1500,7 @@
flags |= PM_PRESENT;
if (pmd_soft_dirty(pmd))
- flags |= PM_SOFT_DIRTY;
+ flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE;
if (pmd_uffd_wp(pmd))
flags |= PM_UFFD_WP;
if (pm->show_pfn)
@@ -1442,7 +1519,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
}
flags |= PM_SWAP;
if (pmd_swp_soft_dirty(pmd))
- flags |= PM_SOFT_DIRTY;
+ flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE;
if (pmd_swp_uffd_wp(pmd))
flags |= PM_UFFD_WP;
VM_BUG_ON(!is_pmd_migration_entry(pmd));
--
2.30.2
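For illustration, a minimal userspace sketch of the POC interface above, assuming the 17-byte format the patch checks for ('6' followed by two native 8-byte unsigned longs); the helper name is hypothetical:

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    /* Clear soft-dirty bits for [start, end) in the target task (sketch). */
    static int clear_refs_range(int pid, unsigned long start, unsigned long end)
    {
        char path[64], buf[17];
        int fd, ok;

        snprintf(path, sizeof(path), "/proc/%d/clear_refs", pid);
        fd = open(path, O_WRONLY);
        if (fd < 0)
            return -1;
        buf[0] = '6';               /* selects the POC range mode */
        memcpy(buf + 1, &start, 8); /* kernel reads raw longs at buf+1, buf+9 */
        memcpy(buf + 9, &end, 8);
        ok = write(fd, buf, sizeof(buf)) == (ssize_t)sizeof(buf);
        close(fd);
        return ok ? 0 : -1;
    }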

View File

@ -0,0 +1,360 @@
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Mon, 14 Mar 2016 11:10:58 -0600
Subject: [PATCH] pci pme wakeups
Reduce wakeups for PME checks, which are a workaround for miswired
boards (sadly, too many of them) in laptops.
---
drivers/pci/pci.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index c9338f9..6974fbf 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -62,7 +62,7 @@ struct pci_pme_device {
struct pci_dev *dev;
};
-#define PME_TIMEOUT 1000 /* How long between PME checks */
+#define PME_TIMEOUT 4000 /* How long between PME checks */
static void pci_dev_d3_sleep(struct pci_dev *dev)
{
--
https://clearlinux.org
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sat, 19 Mar 2016 21:32:19 -0400
Subject: [PATCH] intel_idle: tweak cpuidle cstates
Increase target_residency in cpuidle cstate
Tune intel_idle to be a bit less aggressive;
Clear Linux is cleaner in hygiene (wakeups) than the average Linux,
so we can afford to change these in a way that increases
performance while keeping power efficiency.
---
drivers/idle/intel_idle.c | 44 +++++++++++++++++++--------------------
1 file changed, 22 insertions(+), 22 deletions(-)
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index f449584..c994d24 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -531,7 +531,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = {
.desc = "MWAIT 0x01",
.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
.exit_latency = 10,
- .target_residency = 20,
+ .target_residency = 120,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
@@ -539,7 +539,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = {
.desc = "MWAIT 0x10",
.flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED,
.exit_latency = 33,
- .target_residency = 100,
+ .target_residency = 900,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
@@ -547,7 +547,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = {
.desc = "MWAIT 0x20",
.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
.exit_latency = 133,
- .target_residency = 400,
+ .target_residency = 1000,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
@@ -555,7 +555,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = {
.desc = "MWAIT 0x32",
.flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED,
.exit_latency = 166,
- .target_residency = 500,
+ .target_residency = 1500,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
@@ -563,7 +563,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = {
.desc = "MWAIT 0x40",
.flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED,
.exit_latency = 300,
- .target_residency = 900,
+ .target_residency = 2000,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
@@ -571,7 +571,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = {
.desc = "MWAIT 0x50",
.flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED,
.exit_latency = 600,
- .target_residency = 1800,
+ .target_residency = 5000,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
@@ -579,7 +579,7 @@ static struct cpuidle_state hsw_cstates[] __initdata = {
.desc = "MWAIT 0x60",
.flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
.exit_latency = 2600,
- .target_residency = 7700,
+ .target_residency = 9000,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
@@ -599,7 +599,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = {
.desc = "MWAIT 0x01",
.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
.exit_latency = 10,
- .target_residency = 20,
+ .target_residency = 120,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
@@ -607,7 +607,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = {
.desc = "MWAIT 0x10",
.flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED,
.exit_latency = 40,
- .target_residency = 100,
+ .target_residency = 1000,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
@@ -615,7 +615,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = {
.desc = "MWAIT 0x20",
.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
.exit_latency = 133,
- .target_residency = 400,
+ .target_residency = 1000,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
@@ -623,7 +623,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = {
.desc = "MWAIT 0x32",
.flags = MWAIT2flg(0x32) | CPUIDLE_FLAG_TLB_FLUSHED,
.exit_latency = 166,
- .target_residency = 500,
+ .target_residency = 2000,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
@@ -631,7 +631,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = {
.desc = "MWAIT 0x40",
.flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED,
.exit_latency = 300,
- .target_residency = 900,
+ .target_residency = 4000,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
@@ -639,7 +639,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = {
.desc = "MWAIT 0x50",
.flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED,
.exit_latency = 600,
- .target_residency = 1800,
+ .target_residency = 7000,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
@@ -647,7 +647,7 @@ static struct cpuidle_state bdw_cstates[] __initdata = {
.desc = "MWAIT 0x60",
.flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
.exit_latency = 2600,
- .target_residency = 7700,
+ .target_residency = 9000,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
@@ -668,7 +668,7 @@ static struct cpuidle_state skl_cstates[] __initdata = {
.desc = "MWAIT 0x01",
.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
.exit_latency = 10,
- .target_residency = 20,
+ .target_residency = 120,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
@@ -676,7 +676,7 @@ static struct cpuidle_state skl_cstates[] __initdata = {
.desc = "MWAIT 0x10",
.flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED,
.exit_latency = 70,
- .target_residency = 100,
+ .target_residency = 1000,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
@@ -684,7 +684,7 @@ static struct cpuidle_state skl_cstates[] __initdata = {
.desc = "MWAIT 0x20",
.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
.exit_latency = 85,
- .target_residency = 200,
+ .target_residency = 600,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
@@ -692,7 +692,7 @@ static struct cpuidle_state skl_cstates[] __initdata = {
.desc = "MWAIT 0x33",
.flags = MWAIT2flg(0x33) | CPUIDLE_FLAG_TLB_FLUSHED,
.exit_latency = 124,
- .target_residency = 800,
+ .target_residency = 3000,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
@@ -700,7 +700,7 @@ static struct cpuidle_state skl_cstates[] __initdata = {
.desc = "MWAIT 0x40",
.flags = MWAIT2flg(0x40) | CPUIDLE_FLAG_TLB_FLUSHED,
.exit_latency = 200,
- .target_residency = 800,
+ .target_residency = 3200,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
@@ -708,7 +708,7 @@ static struct cpuidle_state skl_cstates[] __initdata = {
.desc = "MWAIT 0x50",
.flags = MWAIT2flg(0x50) | CPUIDLE_FLAG_TLB_FLUSHED,
.exit_latency = 480,
- .target_residency = 5000,
+ .target_residency = 9000,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
@@ -716,7 +716,7 @@ static struct cpuidle_state skl_cstates[] __initdata = {
.desc = "MWAIT 0x60",
.flags = MWAIT2flg(0x60) | CPUIDLE_FLAG_TLB_FLUSHED,
.exit_latency = 890,
- .target_residency = 5000,
+ .target_residency = 9000,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
@@ -737,7 +737,7 @@ static struct cpuidle_state skx_cstates[] __initdata = {
.desc = "MWAIT 0x01",
.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
.exit_latency = 10,
- .target_residency = 20,
+ .target_residency = 300,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
--
https://clearlinux.org
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Fri, 6 Jan 2017 15:34:09 +0000
Subject: [PATCH] ipv4/tcp: allow the memory tuning for tcp to go a little
bigger than default
---
net/ipv4/tcp.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 30c1142..4345075 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -4201,8 +4201,8 @@ void __init tcp_init(void)
tcp_init_mem();
/* Set per-socket limits to no more than 1/128 the pressure threshold */
limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
- max_wshare = min(4UL*1024*1024, limit);
- max_rshare = min(6UL*1024*1024, limit);
+ max_wshare = min(16UL*1024*1024, limit);
+ max_rshare = min(16UL*1024*1024, limit);
init_net.ipv4.sysctl_tcp_wmem[0] = SK_MEM_QUANTUM;
init_net.ipv4.sysctl_tcp_wmem[1] = 16*1024;
--
https://clearlinux.org
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 18 Feb 2018 23:35:41 +0000
Subject: [PATCH] locking: rwsem: spin faster
tweak rwsem owner spinning a bit
---
kernel/locking/rwsem.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index f11b9bd..1bbfcc1 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -717,6 +717,7 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable)
struct task_struct *new, *owner;
unsigned long flags, new_flags;
enum owner_state state;
+ int i = 0;
owner = rwsem_owner_flags(sem, &flags);
state = rwsem_owner_state(owner, flags, nonspinnable);
@@ -750,7 +751,8 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable)
break;
}
- cpu_relax();
+ if (i++ > 1000)
+ cpu_relax();
}
rcu_read_unlock();
--
https://clearlinux.org
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Thu, 2 Jun 2016 23:36:32 -0500
Subject: [PATCH] initialize ata before graphics
ATA init is the long pole in the boot process, and it's asynchronous.
Move the graphics init after it so that ATA and graphics initialize
in parallel.
---
drivers/Makefile | 15 ++++++++-------
1 file changed, 8 insertions(+), 7 deletions(-)
diff --git a/drivers/Makefile b/drivers/Makefile
index c0cd1b9..af1e2fb 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -59,15 +59,8 @@ obj-y += char/
# iommu/ comes before gpu as gpu are using iommu controllers
obj-y += iommu/
-# gpu/ comes after char for AGP vs DRM startup and after iommu
-obj-y += gpu/
-
obj-$(CONFIG_CONNECTOR) += connector/
-# i810fb and intelfb depend on char/agp/
-obj-$(CONFIG_FB_I810) += video/fbdev/i810/
-obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/
-
obj-$(CONFIG_PARPORT) += parport/
obj-$(CONFIG_NVM) += lightnvm/
obj-y += base/ block/ misc/ mfd/ nfc/
@@ -80,6 +73,14 @@ obj-$(CONFIG_IDE) += ide/
obj-y += scsi/
obj-y += nvme/
obj-$(CONFIG_ATA) += ata/
+
+# gpu/ comes after char for AGP vs DRM startup and after iommu
+obj-y += gpu/
+
+# i810fb and intelfb depend on char/agp/
+obj-$(CONFIG_FB_I810) += video/fbdev/i810/
+obj-$(CONFIG_FB_INTEL) += video/fbdev/intelfb/
+
obj-$(CONFIG_TARGET_CORE) += target/
obj-$(CONFIG_MTD) += mtd/
obj-$(CONFIG_SPI) += spi/
--
https://clearlinux.org

View File

@ -0,0 +1,363 @@
From 9c85113cf4019e7b277a44e72bda8b78347aa72f Mon Sep 17 00:00:00 2001
From: Paul Gofman <pgofman@codeweavers.com>
Date: Thu, 7 May 2020 14:05:31 +0300
Subject: [PATCH 2/2] mm: Support soft dirty flag read with reset.
---
fs/proc/base.c | 3 +
fs/proc/internal.h | 1 +
fs/proc/task_mmu.c | 144 +++++++++++++++++++++++++++++++++++++++------
3 files changed, 130 insertions(+), 18 deletions(-)
diff --git a/fs/proc/base.c b/fs/proc/base.c
index b3422cda2a91..8199ae2411ca 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3202,6 +3202,9 @@ static const struct pid_entry tgid_base_stuff[] = {
REG("smaps", S_IRUGO, proc_pid_smaps_operations),
REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
REG("pagemap", S_IRUSR, proc_pagemap_operations),
+#ifdef CONFIG_MEM_SOFT_DIRTY
+ REG("pagemap_reset", S_IRUSR, proc_pagemap_reset_operations),
+#endif
#endif
#ifdef CONFIG_SECURITY
DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index f60b379dcdc7..36a901cf0e7f 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -303,6 +303,7 @@ extern const struct file_operations proc_pid_smaps_operations;
extern const struct file_operations proc_pid_smaps_rollup_operations;
extern const struct file_operations proc_clear_refs_operations;
extern const struct file_operations proc_pagemap_operations;
+extern const struct file_operations proc_pagemap_reset_operations;
extern unsigned long task_vsize(struct mm_struct *);
extern unsigned long task_statm(struct mm_struct *,
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 7c7865028f10..a21694967915 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1056,8 +1056,8 @@ static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr,
return page_maybe_dma_pinned(page);
}
-static inline void clear_soft_dirty(struct vm_area_struct *vma,
- unsigned long addr, pte_t *pte)
+static inline bool clear_soft_dirty(struct vm_area_struct *vma,
+ unsigned long addr, pte_t *pte)
{
/*
* The soft-dirty tracker uses #PF-s to catch writes
@@ -1066,37 +1066,46 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
* of how soft-dirty works.
*/
pte_t ptent = *pte;
+ bool ret = false;
if (pte_present(ptent)) {
pte_t old_pte;
if (pte_is_pinned(vma, addr, ptent))
- return;
+ return ret;
old_pte = ptep_modify_prot_start(vma, addr, pte);
+ ret = pte_soft_dirty(old_pte);
ptent = pte_wrprotect(old_pte);
ptent = pte_clear_soft_dirty(ptent);
ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
} else if (is_swap_pte(ptent)) {
+ ret = pte_swp_soft_dirty(ptent);
ptent = pte_swp_clear_soft_dirty(ptent);
set_pte_at(vma->vm_mm, addr, pte, ptent);
}
+ return ret;
}
#else
-static inline void clear_soft_dirty(struct vm_area_struct *vma,
+static inline bool clear_soft_dirty(struct vm_area_struct *vma,
unsigned long addr, pte_t *pte)
{
+ return false;
}
#endif
#if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
-static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
+static inline bool clear_soft_dirty_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmdp)
{
pmd_t old, pmd = *pmdp;
+ bool ret = false;
if (pmd_present(pmd)) {
/* See comment in change_huge_pmd() */
old = pmdp_invalidate(vma, addr, pmdp);
+
+ ret = pmd_soft_dirty(old);
+
if (pmd_dirty(old))
pmd = pmd_mkdirty(pmd);
if (pmd_young(old))
@@ -1107,14 +1116,17 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
} else if (is_migration_entry(pmd_to_swp_entry(pmd))) {
+ ret = pmd_swp_soft_dirty(pmd);
pmd = pmd_swp_clear_soft_dirty(pmd);
set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
}
+ return ret;
}
#else
-static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
+static inline bool clear_soft_dirty_pmd(struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmdp)
{
+ return false;
}
#endif
@@ -1367,6 +1379,7 @@ struct pagemapread {
int pos, len; /* units: PM_ENTRY_BYTES, not bytes */
pagemap_entry_t *buffer;
bool show_pfn;
+ bool reset;
};
#define PAGEMAP_WALK_SIZE (PMD_SIZE)
@@ -1398,6 +1411,14 @@ static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
return 0;
}
+static int add_addr_to_pagemap(unsigned long addr, struct pagemapread *pm)
+{
+ ((unsigned long *)pm->buffer)[pm->pos++] = addr;
+ if (pm->pos >= pm->len)
+ return PM_END_OF_BUFFER;
+ return 0;
+}
+
static int pagemap_pte_hole(unsigned long start, unsigned long end,
__always_unused int depth, struct mm_walk *walk)
{
@@ -1405,6 +1426,9 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
unsigned long addr = start;
int err = 0;
+ if (pm->reset)
+ goto out;
+
while (addr < end) {
struct vm_area_struct *vma = find_vma(walk->mm, addr);
pagemap_entry_t pme = make_pme(0, 0);
@@ -1439,8 +1463,9 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
}
static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
- struct vm_area_struct *vma, unsigned long addr, pte_t pte)
+ struct vm_area_struct *vma, unsigned long addr, pte_t *pte_addr)
{
+ pte_t pte = *pte_addr;
u64 frame = 0, flags = 0;
struct page *page = NULL;
@@ -1493,6 +1518,20 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
pmd_t pmd = *pmdp;
struct page *page = NULL;
+ if (pm->reset)
+ {
+ if (clear_soft_dirty_pmd(vma, addr, pmdp))
+ {
+ for (; addr != end; addr += PAGE_SIZE)
+ {
+ err = add_addr_to_pagemap(addr, pm);
+ if (err)
+ break;
+ }
+ }
+ goto trans_huge_done;
+ }
+
if (vma->vm_flags & VM_SOFTDIRTY)
flags |= PM_SOFT_DIRTY;
@@ -1541,6 +1580,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
frame += (1 << MAX_SWAPFILES_SHIFT);
}
}
+trans_huge_done:
spin_unlock(ptl);
return err;
}
@@ -1555,10 +1595,18 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
*/
orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl);
for (; addr < end; pte++, addr += PAGE_SIZE) {
- pagemap_entry_t pme;
+ if (pm->reset)
+ {
+ if (clear_soft_dirty(vma, addr, pte))
+ err = add_addr_to_pagemap(addr, pm);
+ }
+ else
+ {
+ pagemap_entry_t pme;
- pme = pte_to_pagemap_entry(pm, vma, addr, *pte);
- err = add_to_pagemap(addr, &pme, pm);
+ pme = pte_to_pagemap_entry(pm, vma, addr, pte);
+ err = add_to_pagemap(addr, &pme, pm);
+ }
if (err)
break;
}
@@ -1650,8 +1698,8 @@ static const struct mm_walk_ops pagemap_ops = {
* determine which areas of memory are actually mapped and llseek to
* skip over unmapped regions.
*/
-static ssize_t pagemap_read(struct file *file, char __user *buf,
- size_t count, loff_t *ppos)
+static ssize_t do_pagemap_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos, bool reset)
{
struct mm_struct *mm = file->private_data;
struct pagemapread pm;
@@ -1660,6 +1708,8 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
unsigned long start_vaddr;
unsigned long end_vaddr;
int ret = 0, copied = 0;
+ struct mmu_notifier_range range;
+ size_t buffer_len;
if (!mm || !mmget_not_zero(mm))
goto out;
@@ -1675,19 +1725,38 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
/* do not disclose physical addresses: attack vector */
pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN);
+ pm.reset = reset;
- pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
- pm.buffer = kmalloc_array(pm.len, PM_ENTRY_BYTES, GFP_KERNEL);
+ buffer_len = min(PAGEMAP_WALK_SIZE >> PAGE_SHIFT, count / PM_ENTRY_BYTES);
+
+ pm.buffer = kmalloc_array(buffer_len, PM_ENTRY_BYTES, GFP_KERNEL);
ret = -ENOMEM;
if (!pm.buffer)
goto out_mm;
src = *ppos;
svpfn = src / PM_ENTRY_BYTES;
- end_vaddr = mm->task_size;
+
+ start_vaddr = svpfn << PAGE_SHIFT;
+
+ if (reset)
+ {
+ if (count < sizeof(end_vaddr))
+ {
+ ret = -EINVAL;
+ goto out_mm;
+ }
+ if (copy_from_user(&end_vaddr, buf, sizeof(end_vaddr)))
+ return -EFAULT;
+ end_vaddr = min(end_vaddr, mm->task_size);
+ }
+ else
+ {
+ end_vaddr = mm->task_size;
+ start_vaddr = end_vaddr;
+ }
/* watch out for wraparound */
- start_vaddr = end_vaddr;
if (svpfn <= (ULONG_MAX >> PAGE_SHIFT))
start_vaddr = untagged_addr(svpfn << PAGE_SHIFT);
@@ -1707,18 +1776,35 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
unsigned long end;
pm.pos = 0;
- end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK;
+ pm.len = min(buffer_len, count / PM_ENTRY_BYTES);
+
+ end = reset ? end_vaddr : (start_vaddr + (pm.len << PAGE_SHIFT));
/* overflow ? */
if (end < start_vaddr || end > end_vaddr)
end = end_vaddr;
+
ret = mmap_read_lock_killable(mm);
if (ret)
goto out_free;
+
+ if (reset)
+ {
+ inc_tlb_flush_pending(mm);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY,
+ 0, NULL, mm, start_vaddr, end);
+ mmu_notifier_invalidate_range_start(&range);
+ }
ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm);
+ if (reset)
+ {
+ mmu_notifier_invalidate_range_end(&range);
+ flush_tlb_mm(mm);
+ dec_tlb_flush_pending(mm);
+ }
mmap_read_unlock(mm);
- start_vaddr = end;
len = min(count, PM_ENTRY_BYTES * pm.pos);
+ BUG_ON(ret && ret != PM_END_OF_BUFFER);
if (copy_to_user(buf, pm.buffer, len)) {
ret = -EFAULT;
goto out_free;
@@ -1726,6 +1812,8 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
copied += len;
buf += len;
count -= len;
+
+ start_vaddr = reset && pm.pos == pm.len ? ((unsigned long *)pm.buffer)[pm.pos - 1] + PAGE_SIZE : end;
}
*ppos += copied;
if (!ret || ret == PM_END_OF_BUFFER)
@@ -1739,6 +1827,18 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
return ret;
}
+static ssize_t pagemap_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return do_pagemap_read(file, buf, count, ppos, false);
+}
+
+static ssize_t pagemap_reset_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return do_pagemap_read(file, buf, count, ppos, true);
+}
+
static int pagemap_open(struct inode *inode, struct file *file)
{
struct mm_struct *mm;
@@ -1765,6 +1865,14 @@ const struct file_operations proc_pagemap_operations = {
.open = pagemap_open,
.release = pagemap_release,
};
+
+const struct file_operations proc_pagemap_reset_operations = {
+ .llseek = mem_lseek, /* borrow this */
+ .read = pagemap_reset_read,
+ .open = pagemap_open,
+ .release = pagemap_release,
+};
+
#endif /* CONFIG_PROC_PAGE_MONITOR */
#ifdef CONFIG_NUMA
--
2.30.2
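The read protocol this adds is implicit in the code above; a sketch, assuming a 4096-byte PAGE_SIZE and max >= 1: seek to (start / PAGE_SIZE) * 8, seed the buffer with the end address, and read back one 8-byte address per page whose soft-dirty bit was set (and is now cleared). The helper name is hypothetical:

    #include <string.h>
    #include <unistd.h>

    /* fd is an open /proc/<pid>/pagemap_reset; returns the number of
     * previously soft-dirty page addresses stored in addrs[] (sketch). */
    static ssize_t read_and_reset_soft_dirty(int fd, unsigned long start,
                                             unsigned long end,
                                             unsigned long *addrs, size_t max)
    {
        ssize_t n;

        if (lseek(fd, start / 4096 * 8, SEEK_SET) < 0) /* 8 bytes per entry */
            return -1;
        memcpy(addrs, &end, sizeof(end)); /* read() consumes end_vaddr first */
        n = read(fd, addrs, max * sizeof(*addrs));
        return n < 0 ? n : n / (ssize_t)sizeof(*addrs);
    }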

View File

@ -0,0 +1,676 @@
From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001
From: Tk-Glitch <ti3nou@gmail.com>
Date: Wed, 4 Jul 2018 04:30:08 +0200
Subject: [PATCH 01/17] glitched
---
scripts/mkcompile_h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h
index baf3ab8d9d49..854e32e6aec7 100755
--- a/scripts/mkcompile_h
+++ b/scripts/mkcompile_h
@@ -41,8 +41,8 @@ else
fi
UTS_VERSION="#$VERSION"
-CONFIG_FLAGS=""
-if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi
+CONFIG_FLAGS="TKG"
+if [ -n "$SMP" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS SMP"; fi
if [ -n "$PREEMPT_RT" ] ; then
CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT_RT"
--
2.28.0
From c304f43d14e98d4bf1215fc10bc5012f554bdd8a Mon Sep 17 00:00:00 2001
From: Alexandre Frade <admfrade@gmail.com>
Date: Mon, 29 Jan 2018 16:59:22 +0000
Subject: [PATCH 02/17] dcache: cache_pressure = 50 decreases the rate at which
VFS caches are reclaimed
Signed-off-by: Alexandre Frade <admfrade@gmail.com>
---
fs/dcache.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/fs/dcache.c b/fs/dcache.c
index 361ea7ab30ea..0c5cf69b241a 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -71,7 +71,7 @@
* If no ancestor relationship:
* arbitrary, since it's serialized on rename_lock
*/
-int sysctl_vfs_cache_pressure __read_mostly = 100;
+int sysctl_vfs_cache_pressure __read_mostly = 50;
EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
__cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
--
2.28.0
From 28f32f59d9d55ac7ec3a20b79bdd02d2a0a5f7e1 Mon Sep 17 00:00:00 2001
From: Alexandre Frade <admfrade@gmail.com>
Date: Mon, 29 Jan 2018 18:29:13 +0000
Subject: [PATCH 03/17] sched/core: nr_migrate = 128 increases number of tasks
to iterate in a single balance run.
Signed-off-by: Alexandre Frade <admfrade@gmail.com>
---
kernel/sched/core.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f788cd61df21..2bfbb4213707 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -59,7 +59,7 @@ const_debug unsigned int sysctl_sched_features =
#ifdef CONFIG_PREEMPT_RT
const_debug unsigned int sysctl_sched_nr_migrate = 8;
#else
-const_debug unsigned int sysctl_sched_nr_migrate = 32;
+const_debug unsigned int sysctl_sched_nr_migrate = 128;
#endif
/*
@@ -71,9 +71,9 @@ __read_mostly int scheduler_running;
/*
* part of the period that we allow rt tasks to run in us.
- * default: 0.95s
+ * XanMod default: 0.98s
*/
-int sysctl_sched_rt_runtime = 950000;
+int sysctl_sched_rt_runtime = 980000;
/*
* __task_rq_lock - lock the rq @p resides on.
--
2.28.0
From acc49f33a10f61dc66c423888cbb883ba46710e4 Mon Sep 17 00:00:00 2001
From: Alexandre Frade <admfrade@gmail.com>
Date: Mon, 29 Jan 2018 17:41:29 +0000
Subject: [PATCH 04/17] scripts: disable the localversion "+" tag of a git repo
Signed-off-by: Alexandre Frade <admfrade@gmail.com>
---
scripts/setlocalversion | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/scripts/setlocalversion b/scripts/setlocalversion
index 20f2efd57b11..0552d8b9f582 100755
--- a/scripts/setlocalversion
+++ b/scripts/setlocalversion
@@ -54,7 +54,7 @@ scm_version()
# If only the short version is requested, don't bother
# running further git commands
if $short; then
- echo "+"
+ # echo "+"
return
fi
# If we are past a tagged commit (like
--
2.28.0
From 61fcb33fb0de8bc0f060e0a1ada38ed149217f4d Mon Sep 17 00:00:00 2001
From: Oleksandr Natalenko <oleksandr@redhat.com>
Date: Wed, 11 Dec 2019 11:46:19 +0100
Subject: [PATCH 05/17] init/Kconfig: enable -O3 for all arches
Building a kernel with -O3 may help in hunting bugs like [1] and thus
using this switch should not be restricted to one specific arch.
With that, let's expose it for everyone.
[1] https://lore.kernel.org/lkml/673b885183fb64f1cbb3ed2387524077@natalenko.name/
Signed-off-by: Oleksandr Natalenko <oleksandr@redhat.com>
---
init/Kconfig | 1 -
1 file changed, 1 deletion(-)
diff --git a/init/Kconfig b/init/Kconfig
index 0498af567f70..3ae8678e1145 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1278,7 +1278,6 @@ config CC_OPTIMIZE_FOR_PERFORMANCE
config CC_OPTIMIZE_FOR_PERFORMANCE_O3
bool "Optimize more for performance (-O3)"
- depends on ARC
help
Choosing this option will pass "-O3" to your compiler to optimize
the kernel yet more for performance.
--
2.28.0
From 360c6833e07cc9fdef5746f6bc45bdbc7212288d Mon Sep 17 00:00:00 2001
From: "Jan Alexander Steffens (heftig)" <jan.steffens@gmail.com>
Date: Fri, 26 Oct 2018 11:22:33 +0100
Subject: [PATCH 06/17] infiniband: Fix __read_overflow2 error with -O3
inlining
---
drivers/infiniband/core/addr.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c
index 3a98439bba83..6efc4f907f58 100644
--- a/drivers/infiniband/core/addr.c
+++ b/drivers/infiniband/core/addr.c
@@ -820,6 +820,7 @@ int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid,
union {
struct sockaddr_in _sockaddr_in;
struct sockaddr_in6 _sockaddr_in6;
+ struct sockaddr_ib _sockaddr_ib;
} sgid_addr, dgid_addr;
int ret;
--
2.28.0
From f85ed068b4d0e6c31edce8574a95757a60e58b87 Mon Sep 17 00:00:00 2001
From: Etienne Juvigny <Ti3noU@gmail.com>
Date: Mon, 3 Sep 2018 17:36:25 +0200
Subject: [PATCH 07/17] Zenify & stuff
---
init/Kconfig | 32 ++++++++++++++++++++++++++++++++
kernel/sched/fair.c | 25 +++++++++++++++++++++++++
mm/page-writeback.c | 8 ++++++++
3 files changed, 65 insertions(+)
diff --git a/init/Kconfig b/init/Kconfig
index 3ae8678e1145..da708eed0f1e 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -92,6 +92,38 @@ config THREAD_INFO_IN_TASK
menu "General setup"
+config ZENIFY
+ bool "A selection of patches from Zen/Liquorix kernel and additional tweaks for a better gaming experience"
+ default y
+ help
+ Tunes the kernel for responsiveness at the cost of throughput and power usage.
+
+ --- Virtual Memory Subsystem ---------------------------
+
+ Mem dirty before bg writeback..: 10 % -> 20 %
+ Mem dirty before sync writeback: 20 % -> 50 %
+
+ --- Block Layer ----------------------------------------
+
+ Queue depth...............: 128 -> 512
+ Default MQ scheduler......: mq-deadline -> bfq
+
+ --- CFS CPU Scheduler ----------------------------------
+
+ Scheduling latency.............: 6 -> 3 ms
+ Minimal granularity............: 0.75 -> 0.3 ms
+ Wakeup granularity.............: 1 -> 0.5 ms
+ CPU migration cost.............: 0.5 -> 0.25 ms
+ Bandwidth slice size...........: 5 -> 3 ms
+ Ondemand fine upscaling limit..: 95 % -> 85 %
+
+ --- MuQSS CPU Scheduler --------------------------------
+
+ Scheduling interval............: 6 -> 3 ms
+ ISO task max realtime use......: 70 % -> 25 %
+ Ondemand coarse upscaling limit: 80 % -> 45 %
+ Ondemand fine upscaling limit..: 95 % -> 45 %
+
config BROKEN
bool
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6b3b59cc51d6..2a0072192c3d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -37,8 +37,13 @@
*
* (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
*/
+#ifdef CONFIG_ZENIFY
+unsigned int sysctl_sched_latency = 3000000ULL;
+static unsigned int normalized_sysctl_sched_latency = 3000000ULL;
+#else
unsigned int sysctl_sched_latency = 6000000ULL;
static unsigned int normalized_sysctl_sched_latency = 6000000ULL;
+#endif
/*
* The initial- and re-scaling of tunables is configurable
@@ -58,21 +63,34 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_L
*
* (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
+#ifdef CONFIG_ZENIFY
+unsigned int sysctl_sched_min_granularity = 300000ULL;
+static unsigned int normalized_sysctl_sched_min_granularity = 300000ULL;
+#else
unsigned int sysctl_sched_min_granularity = 750000ULL;
static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
+#endif
/*
* Minimal preemption granularity for CPU-bound SCHED_IDLE tasks.
* Applies only when SCHED_IDLE tasks compete with normal tasks.
*
* (default: 0.75 msec)
*/
+#ifdef CONFIG_ZENIFY
+unsigned int sysctl_sched_idle_min_granularity = 300000ULL;
+#else
unsigned int sysctl_sched_idle_min_granularity = 750000ULL;
+#endif
/*
* This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
*/
+#ifdef CONFIG_ZENIFY
+static unsigned int sched_nr_latency = 10;
+#else
static unsigned int sched_nr_latency = 8;
+#endif
/*
* After fork, child runs first. If set to 0 (default) then
@@ -128,8 +149,12 @@ int __weak arch_asym_cpu_priority(int cpu)
*
* (default: 5 msec, units: microseconds)
*/
+#ifdef CONFIG_ZENIFY
+unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL;
+#else
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif
+#endif
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 28b3e7a67565..01a1aef2b9b1 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -71,7 +71,11 @@ static long ratelimit_pages = 32;
/*
* Start background writeback (via writeback threads) at this percentage
*/
+#ifdef CONFIG_ZENIFY
+int dirty_background_ratio = 20;
+#else
int dirty_background_ratio = 10;
+#endif
/*
* dirty_background_bytes starts at 0 (disabled) so that it is a function of
@@ -88,7 +92,11 @@ int vm_highmem_is_dirtyable;
/*
* The generator of dirty data starts writeback at this percentage
*/
+#ifdef CONFIG_ZENIFY
+int vm_dirty_ratio = 50;
+#else
int vm_dirty_ratio = 20;
+#endif
/*
* vm_dirty_bytes starts at 0 (disabled) so that it is a function of
--
2.28.0
From e92e67143385cf285851e12aa8b7f083dd38dd24 Mon Sep 17 00:00:00 2001
From: Steven Barrett <damentz@liquorix.net>
Date: Sun, 16 Jan 2011 18:57:32 -0600
Subject: [PATCH 08/17] ZEN: Allow TCP YeAH as default congestion control
4.4: In my tests YeAH dramatically slowed down transfers over a WLAN,
reducing throughput from ~65Mbps (CUBIC) to ~7MBps (YeAH) over 10
seconds (netperf TCP_STREAM) including long stalls.
Be careful when choosing this. ~heftig
---
net/ipv4/Kconfig | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index e64e59b536d3..bfb55ef7ebbe 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -691,6 +691,9 @@ choice
config DEFAULT_VEGAS
bool "Vegas" if TCP_CONG_VEGAS=y
+ config DEFAULT_YEAH
+ bool "YeAH" if TCP_CONG_YEAH=y
+
config DEFAULT_VENO
bool "Veno" if TCP_CONG_VENO=y
@@ -724,6 +727,7 @@ config DEFAULT_TCP_CONG
default "htcp" if DEFAULT_HTCP
default "hybla" if DEFAULT_HYBLA
default "vegas" if DEFAULT_VEGAS
+ default "yeah" if DEFAULT_YEAH
default "westwood" if DEFAULT_WESTWOOD
default "veno" if DEFAULT_VENO
default "reno" if DEFAULT_RENO
--
2.28.0
From 76dbe7477bfde1b5e8bf29a71b5af7ab2be9b98e Mon Sep 17 00:00:00 2001
From: Steven Barrett <steven@liquorix.net>
Date: Wed, 28 Nov 2018 19:01:27 -0600
Subject: [PATCH 09/17] zen: Use [defer+madvise] as default khugepaged defrag
strategy
For some reason, the default strategy to respond to THP fault fallbacks
is still just madvise, meaning stall if the program wants transparent
hugepages, but don't trigger a background reclaim / compaction if THP
begins to fail allocations. This creates a snowball effect where we
still use the THP code paths, but we almost always fail once a system
has been active and busy for a while.
The option "defer" was created for interactive systems where THP can
still improve performance. If we have to fallback to a regular page due
to an allocation failure or anything else, we will trigger a background
reclaim and compaction so future THP attempts succeed and previous
attempts eventually have their smaller pages combined without stalling
running applications.
We still want madvise to stall applications that explicitly want THP,
so defer+madvise _does_ make a ton of sense. Make it the default for
interactive systems, especially if the kernel maintainer left
transparent hugepages on "always".
Reasoning and details in the original patch: https://lwn.net/Articles/711248/
---
mm/huge_memory.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 74300e337c3c..9277f22c10a7 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -53,7 +53,11 @@ unsigned long transparent_hugepage_flags __read_mostly =
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE
(1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)|
#endif
+#ifdef CONFIG_ZENIFY
+ (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG)|
+#else
(1<<TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG)|
+#endif
(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG)|
(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
--
2.28.0
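The hunk above only flips the built-in default; the same policy can be selected at runtime through the mainline THP sysfs knob, as in this sketch:

    #include <fcntl.h>
    #include <unistd.h>

    /* Select the defer+madvise defrag policy at runtime (sketch). */
    static int thp_defrag_defer_madvise(void)
    {
        int fd = open("/sys/kernel/mm/transparent_hugepage/defrag", O_WRONLY);
        ssize_t n;

        if (fd < 0)
            return -1;
        n = write(fd, "defer+madvise", 13);
        close(fd);
        return n == 13 ? 0 : -1;
    }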
From 2b65a1329cb220b43c19c4d0de5833fae9e2b22d Mon Sep 17 00:00:00 2001
From: Alexandre Frade <admfrade@gmail.com>
Date: Wed, 24 Oct 2018 16:58:52 -0300
Subject: [PATCH 10/17] net/sched: allow configuring cake qdisc as default
Signed-off-by: Alexandre Frade <admfrade@gmail.com>
---
net/sched/Kconfig | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index 84badf00647e..6a922bca9f39 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -471,6 +471,9 @@ choice
config DEFAULT_SFQ
bool "Stochastic Fair Queue" if NET_SCH_SFQ
+ config DEFAULT_CAKE
+ bool "Common Applications Kept Enhanced" if NET_SCH_CAKE
+
config DEFAULT_PFIFO_FAST
bool "Priority FIFO Fast"
endchoice
@@ -481,6 +484,7 @@ config DEFAULT_NET_SCH
default "fq" if DEFAULT_FQ
default "fq_codel" if DEFAULT_FQ_CODEL
default "sfq" if DEFAULT_SFQ
+ default "cake" if DEFAULT_CAKE
default "pfifo_fast"
endif
--
2.28.0
From 816ee502759e954304693813bd03d94986b28dba Mon Sep 17 00:00:00 2001
From: Tk-Glitch <ti3nou@gmail.com>
Date: Mon, 18 Feb 2019 17:40:57 +0100
Subject: [PATCH 11/17] mm: Set watermark_scale_factor to 200 (from 10)
Multiple users have reported that it helps reduce/eliminate stuttering
with DXVK.
---
mm/page_alloc.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 898ff44f2c7b..e72074034793 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -330,7 +330,7 @@ int watermark_boost_factor __read_mostly;
int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
int watermark_boost_factor __read_mostly = 15000;
-int watermark_scale_factor = 10;
+int watermark_scale_factor = 200;
static unsigned long nr_kernel_pages __initdata;
static unsigned long nr_all_pages __initdata;
--
2.28.0
From 90240bcd90a568878738e66c0d45bed3e38e347b Mon Sep 17 00:00:00 2001
From: Tk-Glitch <ti3nou@gmail.com>
Date: Fri, 19 Apr 2019 12:33:38 +0200
Subject: [PATCH 12/17] Set vm.max_map_count to 262144 by default
The value is still pretty low, and AMD64-ABI and ELF extended numbering
support that, so we should be fine on modern x86 systems.
This fixes crashes in some applications using more than 65535 vmas (also
affects some Windows games running in Wine, such as Star Citizen).
---
include/linux/mm.h | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bc05c3588aa3..b0cefe94920d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -190,8 +190,7 @@ static inline void __mm_zero_struct_page(struct page *page)
* not a hard limit any more. Although some userspace tools can be surprised by
* that.
*/
-#define MAPCOUNT_ELF_CORE_MARGIN (5)
-#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN)
+#define DEFAULT_MAX_MAP_COUNT (262144)
extern int sysctl_max_map_count;
--
2.28.0
From 3a34034dba5efe91bcec491efe8c66e8087f509b Mon Sep 17 00:00:00 2001
From: Tk-Glitch <ti3nou@gmail.com>
Date: Mon, 27 Jul 2020 00:19:18 +0200
Subject: [PATCH 13/17] mm: bump DEFAULT_MAX_MAP_COUNT
Some games such as Detroit: Become Human tend to be very crash-prone with
lower values.
---
include/linux/mm.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index b0cefe94920d..890165099b07 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -190,7 +190,7 @@ static inline void __mm_zero_struct_page(struct page *page)
* not a hard limit any more. Although some userspace tools can be surprised by
* that.
*/
-#define DEFAULT_MAX_MAP_COUNT (262144)
+#define DEFAULT_MAX_MAP_COUNT (524288)
extern int sysctl_max_map_count;
--
2.28.0
From 977812938da7c7226415778c340832141d9278b7 Mon Sep 17 00:00:00 2001
From: Alexandre Frade <admfrade@gmail.com>
Date: Mon, 25 Nov 2019 15:13:06 -0300
Subject: [PATCH 14/17] elevator: set default scheduler to bfq for blk-mq
Signed-off-by: Alexandre Frade <admfrade@gmail.com>
---
block/elevator.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/block/elevator.c b/block/elevator.c
index 4eab3d70e880..79669aa39d79 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -623,19 +623,19 @@ static inline bool elv_support_iosched(struct request_queue *q)
}
/*
- * For single queue devices, default to using mq-deadline. If we have multiple
- * queues or mq-deadline is not available, default to "none".
+ * For single queue devices, default to using bfq. If we have multiple
+ * queues or bfq is not available, default to "none".
*/
static struct elevator_type *elevator_get_default(struct request_queue *q)
{
if (q->tag_set && q->tag_set->flags & BLK_MQ_F_NO_SCHED_BY_DEFAULT)
return NULL;
if (q->nr_hw_queues != 1 &&
!blk_mq_is_shared_tags(q->tag_set->flags))
return NULL;
- return elevator_get(q, "mq-deadline", false);
+ return elevator_get(q, "bfq", false);
}
/*
--
2.28.0
From 3c229f434aca65c4ca61772bc03c3e0370817b92 Mon Sep 17 00:00:00 2001
From: Alexandre Frade <kernel@xanmod.org>
Date: Mon, 3 Aug 2020 17:05:04 +0000
Subject: [PATCH 16/17] mm: set 2 megabytes for address_space-level file
read-ahead pages size
Signed-off-by: Alexandre Frade <kernel@xanmod.org>
---
include/linux/pagemap.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index cf2468da68e9..007dea784451 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -655,7 +655,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask);
void delete_from_page_cache_batch(struct address_space *mapping,
struct pagevec *pvec);
-#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE)
+#define VM_READAHEAD_PAGES (SZ_2M / PAGE_SIZE)
void page_cache_sync_readahead(struct address_space *, struct file_ra_state *,
struct file *, pgoff_t index, unsigned long req_count);
--
2.28.0
From 716f41cf6631f3a85834dcb67b4ce99185b6387f Mon Sep 17 00:00:00 2001
From: Steven Barrett <steven@liquorix.net>
Date: Wed, 15 Jan 2020 20:43:56 -0600
Subject: [PATCH 17/17] ZEN: intel-pstate: Implement "enable" parameter
If intel-pstate is compiled into the kernel, it will preempt the loading
of acpi-cpufreq so you can take advantage of hardware p-states without
any friction.
However, intel-pstate is not completely superior to cpufreq's ondemand
for one reason. There's no concept of an up_threshold property.
In ondemand, up_threshold essentially reduces the maximum utilization to
compare against, allowing you to hit max frequencies and turbo boost
from a much lower core utilization.
With intel-pstate, you have the concept of minimum and maximum
performance, but no tunable that lets you define that maximum frequency
means 50% core utilization. For just this oversight, there are reasons
you may want ondemand.
Let's support setting "enable" in kernel boot parameters. This lets
kernel maintainers include "intel_pstate=disable" statically in the
static boot parameters, but let users of the kernel override this
selection.
---
Documentation/admin-guide/kernel-parameters.txt | 3 +++
drivers/cpufreq/intel_pstate.c | 2 ++
2 files changed, 5 insertions(+)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index fb95fad81c79..3e92fee81e33 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1857,6 +1857,9 @@
disable
Do not enable intel_pstate as the default
scaling driver for the supported processors
+ enable
+ Enable intel_pstate in-case "disable" was passed
+ previously in the kernel boot parameters
passive
Use intel_pstate as a scaling driver, but configure it
to work with generic cpufreq governors (instead of
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 36a469150ff9..aee891c9b78a 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -2845,6 +2845,8 @@ static int __init intel_pstate_setup(char *str)
if (!strcmp(str, "no_hwp"))
no_hwp = 1;
+ if (!strcmp(str, "enable"))
+ no_load = 0;
if (!strcmp(str, "force"))
force_load = 1;
if (!strcmp(str, "hwp_only"))
--
2.28.0
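With this change applied, a kernel built with a static "intel_pstate=disable" can be re-enabled from the boot loader by appending:

    intel_pstate=enable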

View File

@ -0,0 +1,22 @@
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6b3b59cc51d6..2a0072192c3d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -81,10 +95,17 @@ unsigned int sysctl_sched_child_runs_first __read_mostly;
*
* (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
+#ifdef CONFIG_ZENIFY
+unsigned int sysctl_sched_wakeup_granularity = 500000UL;
+static unsigned int normalized_sysctl_sched_wakeup_granularity = 500000UL;
+
+const_debug unsigned int sysctl_sched_migration_cost = 50000UL;
+#else
unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+#endif
int sched_thermal_decay_shift;
static int __init setup_sched_thermal_decay_shift(char *str)

View File

@ -0,0 +1,90 @@
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 2a202a846757..1d9c7ed79b11 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -4,7 +4,7 @@
choice
prompt "Timer frequency"
- default HZ_250
+ default HZ_500
help
Allows the configuration of the timer frequency. It is customary
to have the timer interrupt run at 1000 Hz but 100 Hz may be more
@@ -39,6 +39,13 @@ choice
on SMP and NUMA systems and exactly dividing by both PAL and
NTSC frame rates for video and multimedia work.
+ config HZ_500
+ bool "500 HZ"
+ help
+ 500 Hz is a balanced timer frequency. Provides fast interactivity
+ on desktops with great smoothness without increasing CPU power
+ consumption and sacrificing the battery life on laptops.
+
config HZ_1000
bool "1000 HZ"
help
@@ -52,6 +59,7 @@ config HZ
default 100 if HZ_100
default 250 if HZ_250
default 300 if HZ_300
+ default 500 if HZ_500
default 1000 if HZ_1000
config SCHED_HRTICK
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 2a202a846757..1d9c7ed79b11 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -4,7 +4,7 @@
choice
prompt "Timer frequency"
- default HZ_500
+ default HZ_750
help
Allows the configuration of the timer frequency. It is customary
to have the timer interrupt run at 1000 Hz but 100 Hz may be more
@@ -46,6 +46,13 @@ choice
on desktops with great smoothness without increasing CPU power
consumption and sacrificing the battery life on laptops.
+ config HZ_750
+ bool "750 HZ"
+ help
+ 750 Hz is a good timer frequency for desktops. Provides fast
+ interactivity with great smoothness without sacrificing too
+ much throughput.
+
config HZ_1000
bool "1000 HZ"
help
@@ -60,6 +67,7 @@ config HZ
default 250 if HZ_250
default 300 if HZ_300
default 500 if HZ_500
+ default 750 if HZ_750
default 1000 if HZ_1000
config SCHED_HRTICK
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index 6b423eebfd5d..61e3271675d6 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -21,10 +21,10 @@
#include "cpufreq_ondemand.h"
/* On-demand governor macros */
-#define DEF_FREQUENCY_UP_THRESHOLD (80)
-#define DEF_SAMPLING_DOWN_FACTOR (1)
+#define DEF_FREQUENCY_UP_THRESHOLD (55)
+#define DEF_SAMPLING_DOWN_FACTOR (5)
#define MAX_SAMPLING_DOWN_FACTOR (100000)
-#define MICRO_FREQUENCY_UP_THRESHOLD (95)
+#define MICRO_FREQUENCY_UP_THRESHOLD (63)
#define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000)
#define MIN_FREQUENCY_UP_THRESHOLD (1)
#define MAX_FREQUENCY_UP_THRESHOLD (100)

View File

@ -0,0 +1,193 @@
From cdeab384f48dd9c88e2dff2e9ad8d57dca1a1b1c Mon Sep 17 00:00:00 2001
From: Mark Weiman <mark.weiman@markzz.com>
Date: Sun, 12 Aug 2018 11:36:21 -0400
Subject: [PATCH] pci: Enable overrides for missing ACS capabilities
This is an updated version of Alex Williamson's patch from:
https://lkml.org/lkml/2013/5/30/513
Original commit message follows:
PCIe ACS (Access Control Services) is the PCIe 2.0+ feature that
allows us to control whether transactions are allowed to be redirected
in various subnodes of a PCIe topology. For instance, if two
endpoints are below a root port or downstream switch port, the
downstream port may optionally redirect transactions between the
devices, bypassing upstream devices. The same can happen internally
on multifunction devices. The transaction may never be visible to the
upstream devices.
One upstream device that we particularly care about is the IOMMU. If
a redirection occurs in the topology below the IOMMU, then the IOMMU
cannot provide isolation between devices. This is why the PCIe spec
encourages topologies to include ACS support. Without it, we have to
assume peer-to-peer DMA within a hierarchy can bypass IOMMU isolation.
Unfortunately, far too many topologies do not support ACS to make this
a steadfast requirement. Even the latest chipsets from Intel are only
sporadically supporting ACS. We have trouble getting interconnect
vendors to include the PCIe capability required by the spec, let alone
suggested features.
Therefore, we need to add some flexibility. The pcie_acs_override=
boot option lets users opt in specific devices or sets of devices to
assume ACS support. The "downstream" option assumes full ACS support
on root ports and downstream switch ports. The "multifunction"
option assumes the subset of ACS features available on multifunction
endpoints and upstream switch ports are supported. The "id:nnnn:nnnn"
option enables ACS support on devices matching the provided vendor
and device IDs, allowing more strategic ACS overrides. These options
may be combined in any order. A maximum of 16 id specific overrides
are available. It's suggested to use the most limited set of options
necessary to avoid completely disabling ACS across the topology.
Note to hardware vendors, we have facilities to permanently quirk
specific devices which enforce isolation but do not provide an ACS
capability. Please contact me to have your devices added and save
your customers the hassle of this boot option.
Signed-off-by: Mark Weiman <mark.weiman@markzz.com>
---
.../admin-guide/kernel-parameters.txt | 9 ++
drivers/pci/quirks.c | 101 ++++++++++++++++++
2 files changed, 110 insertions(+)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index aefd358a5ca3..173b3596fd9e 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3190,6 +3190,15 @@
nomsi [MSI] If the PCI_MSI kernel config parameter is
enabled, this kernel boot option can be used to
disable the use of MSI interrupts system-wide.
+ pcie_acs_override =
+ [PCIE] Override missing PCIe ACS support for:
+ downstream
+ All downstream ports - full ACS capabilities
+ multifunction
+ All multifunction devices - multifunction ACS subset
+ id:nnnn:nnnn
+ Specific device - full ACS capabilities
+ Specified as vid:did (vendor/device ID) in hex
noioapicquirk [APIC] Disable all boot interrupt quirks.
Safety option to keep boot IRQs enabled. This
should never be necessary.
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index 4700d24e5d55..8f7a3d7fd9c1 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -3372,6 +3372,106 @@ static void quirk_no_bus_reset(struct pci_dev *dev)
dev->dev_flags |= PCI_DEV_FLAGS_NO_BUS_RESET;
}
+static bool acs_on_downstream;
+static bool acs_on_multifunction;
+
+#define NUM_ACS_IDS 16
+struct acs_on_id {
+ unsigned short vendor;
+ unsigned short device;
+};
+static struct acs_on_id acs_on_ids[NUM_ACS_IDS];
+static u8 max_acs_id;
+
+static __init int pcie_acs_override_setup(char *p)
+{
+ if (!p)
+ return -EINVAL;
+
+ while (*p) {
+ if (!strncmp(p, "downstream", 10))
+ acs_on_downstream = true;
+ if (!strncmp(p, "multifunction", 13))
+ acs_on_multifunction = true;
+ if (!strncmp(p, "id:", 3)) {
+ char opt[5];
+ int ret;
+ long val;
+
+ if (max_acs_id >= NUM_ACS_IDS - 1) {
+ pr_warn("Out of PCIe ACS override slots (%d)\n",
+ NUM_ACS_IDS);
+ goto next;
+ }
+
+ p += 3;
+ snprintf(opt, 5, "%s", p);
+ ret = kstrtol(opt, 16, &val);
+ if (ret) {
+ pr_warn("PCIe ACS ID parse error %d\n", ret);
+ goto next;
+ }
+ acs_on_ids[max_acs_id].vendor = val;
+
+ p += strcspn(p, ":");
+ if (*p != ':') {
+ pr_warn("PCIe ACS invalid ID\n");
+ goto next;
+ }
+
+ p++;
+ snprintf(opt, 5, "%s", p);
+ ret = kstrtol(opt, 16, &val);
+ if (ret) {
+ pr_warn("PCIe ACS ID parse error %d\n", ret);
+ goto next;
+ }
+ acs_on_ids[max_acs_id].device = val;
+ max_acs_id++;
+ }
+next:
+ p += strcspn(p, ",");
+ if (*p == ',')
+ p++;
+ }
+
+ if (acs_on_downstream || acs_on_multifunction || max_acs_id)
+ pr_warn("Warning: PCIe ACS overrides enabled; This may allow non-IOMMU protected peer-to-peer DMA\n");
+
+ return 0;
+}
+early_param("pcie_acs_override", pcie_acs_override_setup);
+
+static int pcie_acs_overrides(struct pci_dev *dev, u16 acs_flags)
+{
+ int i;
+
+ /* Never override ACS for legacy devices or devices with ACS caps */
+ if (!pci_is_pcie(dev) ||
+ pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ACS))
+ return -ENOTTY;
+
+ for (i = 0; i < max_acs_id; i++)
+ if (acs_on_ids[i].vendor == dev->vendor &&
+ acs_on_ids[i].device == dev->device)
+ return 1;
+
+ switch (pci_pcie_type(dev)) {
+ case PCI_EXP_TYPE_DOWNSTREAM:
+ case PCI_EXP_TYPE_ROOT_PORT:
+ if (acs_on_downstream)
+ return 1;
+ break;
+ case PCI_EXP_TYPE_ENDPOINT:
+ case PCI_EXP_TYPE_UPSTREAM:
+ case PCI_EXP_TYPE_LEG_END:
+ case PCI_EXP_TYPE_RC_END:
+ if (acs_on_multifunction && dev->multifunction)
+ return 1;
+ }
+
+ return -ENOTTY;
+}
/*
* Some Atheros AR9xxx and QCA988x chips do not behave after a bus reset.
* The device will throw a Link Down error on AER-capable systems and
@@ -4513,6 +4613,7 @@ static const struct pci_dev_acs_enabled {
{ PCI_VENDOR_ID_ZHAOXIN, 0x9083, pci_quirk_mf_endpoint_acs },
/* Zhaoxin Root/Downstream Ports */
{ PCI_VENDOR_ID_ZHAOXIN, PCI_ANY_ID, pci_quirk_zhaoxin_pcie_ports_acs },
+ { PCI_ANY_ID, PCI_ANY_ID, pcie_acs_overrides },
{ 0 }
};
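For illustration, the documented options can be combined in one boot parameter (the device ID below is hypothetical):

    pcie_acs_override=downstream,multifunction,id:10de:1c82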

View File

@ -0,0 +1,165 @@
From b70e738f08403950aa3053c36b98c6b0eeb0eb90 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Almeida?= <andrealmeid@collabora.com>
Date: Mon, 25 Oct 2021 09:49:42 -0300
Subject: [PATCH] futex: Add entry point for FUTEX_WAIT_MULTIPLE (opcode 31)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Add an option to wait on multiple futexes using the old interface, which
uses opcode 31 through the futex() syscall. Do that by just translating the
old interface to use the new code. This allows old and stable versions
of Proton to still use fsync in new kernel releases.
Signed-off-by: André Almeida <andrealmeid@collabora.com>
---
include/uapi/linux/futex.h | 12 ++++++
kernel/futex/core.c | 75 +++++++++++++++++++++++++++++++++++++-
2 files changed, 86 insertions(+), 1 deletion(-)
diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h
index 2a06b99f9803..417c5d89b745 100644
--- a/include/uapi/linux/futex.h
+++ b/include/uapi/linux/futex.h
@@ -22,6 +22,7 @@
#define FUTEX_WAIT_REQUEUE_PI 11
#define FUTEX_CMP_REQUEUE_PI 12
#define FUTEX_LOCK_PI2 13
+#define FUTEX_WAIT_MULTIPLE 31
#define FUTEX_PRIVATE_FLAG 128
#define FUTEX_CLOCK_REALTIME 256
@@ -68,6 +69,17 @@ struct futex_waitv {
__u32 __reserved;
};
+/**
+ * struct futex_wait_block - Block of futexes to be waited for
+ * @uaddr: User address of the futex
+ * @val: Futex value expected by userspace
+ * @bitset: Bitset for the optional bitmasked wakeup
+ */
+struct futex_wait_block {
+ __u32 __user *uaddr;
+ __u32 val;
+ __u32 bitset;
+};
/*
* Support for robust futexes: the kernel cleans up held futexes at
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index 4a9e7ce3714a..c3f2e65afab8 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -4012,6 +4012,7 @@ static __always_inline bool futex_cmd_has_timeout(u32 cmd)
case FUTEX_LOCK_PI2:
case FUTEX_WAIT_BITSET:
case FUTEX_WAIT_REQUEUE_PI:
+ case FUTEX_WAIT_MULTIPLE:
return true;
}
return false;
@@ -4024,13 +4025,79 @@ futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t)
return -EINVAL;
*t = timespec64_to_ktime(*ts);
- if (cmd == FUTEX_WAIT)
+ if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE)
*t = ktime_add_safe(ktime_get(), *t);
else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME))
*t = timens_ktime_to_host(CLOCK_MONOTONIC, *t);
return 0;
}
+/**
+ * futex_read_wait_block - Read an array of futex_wait_block from userspace
+ * @uaddr: Userspace address of the block
+ * @count: Number of blocks to be read
+ *
+ * This function creates and allocate an array of futex_q (we zero it to
+ * initialize the fields) and then, for each futex_wait_block element from
+ * userspace, fill a futex_q element with proper values.
+ */
+inline struct futex_vector *futex_read_wait_block(u32 __user *uaddr, u32 count)
+{
+ unsigned int i;
+ struct futex_vector *futexv;
+ struct futex_wait_block fwb;
+ struct futex_wait_block __user *entry =
+ (struct futex_wait_block __user *)uaddr;
+
+ if (!count || count > FUTEX_WAITV_MAX)
+ return ERR_PTR(-EINVAL);
+
+ futexv = kcalloc(count, sizeof(*futexv), GFP_KERNEL);
+ if (!futexv)
+ return ERR_PTR(-ENOMEM);
+
+ for (i = 0; i < count; i++) {
+ if (copy_from_user(&fwb, &entry[i], sizeof(fwb))) {
+ kfree(futexv);
+ return ERR_PTR(-EFAULT);
+ }
+
+ futexv[i].w.flags = FUTEX_32;
+ futexv[i].w.val = fwb.val;
+ futexv[i].w.uaddr = (uintptr_t) (fwb.uaddr);
+ futexv[i].q = futex_q_init;
+ }
+
+ return futexv;
+}
+
+int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
+ struct hrtimer_sleeper *to);
+
+int futex_opcode_31(ktime_t *abs_time, u32 __user *uaddr, int count)
+{
+ int ret;
+ struct futex_vector *vs;
+ struct hrtimer_sleeper *to = NULL, timeout;
+
+ to = futex_setup_timer(abs_time, &timeout, 0, 0);
+
+ vs = futex_read_wait_block(uaddr, count);
+
+ if (IS_ERR(vs))
+ return PTR_ERR(vs);
+
+ ret = futex_wait_multiple(vs, count, abs_time ? to : NULL);
+ kfree(vs);
+
+ if (to) {
+ hrtimer_cancel(&to->timer);
+ destroy_hrtimer_on_stack(&to->timer);
+ }
+
+ return ret;
+}
+
SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
const struct __kernel_timespec __user *, utime,
u32 __user *, uaddr2, u32, val3)
@@ -4050,6 +4117,9 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
tp = &t;
}
+ if (cmd == FUTEX_WAIT_MULTIPLE)
+ return futex_opcode_31(tp, uaddr, val);
+
return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
}
@@ -4551,6 +4621,9 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
tp = &t;
}
+ if (cmd == FUTEX_WAIT_MULTIPLE)
+ return futex_opcode_31(tp, uaddr, val);
+
return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
}
#endif /* CONFIG_COMPAT_32BIT_TIME */
--
2.33.1
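For reference, here is a minimal userspace sketch of driving opcode 31. It assumes a kernel carrying this patch; the FUTEX_WAIT_MULTIPLE constant and the local struct futex_wait_block mirror the uapi additions above and are defined in the example itself, since mainline headers do not ship them.

/* Hypothetical example: wait on two futexes via the legacy opcode 31.
 * Assumes a kernel with this patch applied; not part of mainline uapi. */
#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

#define FUTEX_WAIT_MULTIPLE 31          /* opcode added by this patch */

struct futex_wait_block {               /* mirrors the uapi struct above */
	uint32_t *uaddr;
	uint32_t val;
	uint32_t bitset;
};

int main(void)
{
	uint32_t f1 = 0, f2 = 0;
	struct futex_wait_block blocks[2] = {
		{ &f1, 0, ~0u },        /* sleep while *uaddr == val */
		{ &f2, 0, ~0u },
	};
	/* The timeout is relative here: the kernel converts it to an
	 * absolute deadline (see the FUTEX_WAIT_MULTIPLE case added to
	 * futex_init_timeout() above). */
	struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };

	/* uaddr carries the block array, val the element count. */
	long woken = syscall(SYS_futex, blocks, FUTEX_WAIT_MULTIPLE,
			     2, &ts, NULL, 0);
	if (woken >= 0)
		printf("futex index %ld was woken\n", woken);
	else
		perror("FUTEX_WAIT_MULTIPLE");
	return 0;
}

With nothing waking f1 or f2, the call returns -1 with ETIMEDOUT after one second; on success, this translation layer returns the array index of a woken futex, matching futex_waitv() semantics.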

View File

@ -0,0 +1,166 @@
From b70e738f08403950aa3053c36b98c6b0eeb0eb90 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Almeida?= <andrealmeid@collabora.com>
Date: Mon, 25 Oct 2021 09:49:42 -0300
Subject: [PATCH] futex: Add entry point for FUTEX_WAIT_MULTIPLE (opcode 31)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Add an option to wait on multiple futexes using the old interface, which
uses opcode 31 of the futex() syscall. Do that by simply translating the
old interface to use the new code. This allows old and stable versions
of Proton to keep using fsync on new kernel releases.
Signed-off-by: André Almeida <andrealmeid@collabora.com>
---
include/uapi/linux/futex.h | 13 +++++++
kernel/futex/syscalls.c | 75 +++++++++++++++++++++++++++++++++++++-
2 files changed, 87 insertions(+), 1 deletion(-)
diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h
index 71a5df8d2689..d375ab21cbf8 100644
--- a/include/uapi/linux/futex.h
+++ b/include/uapi/linux/futex.h
@@ -22,6 +22,7 @@
#define FUTEX_WAIT_REQUEUE_PI 11
#define FUTEX_CMP_REQUEUE_PI 12
#define FUTEX_LOCK_PI2 13
+#define FUTEX_WAIT_MULTIPLE 31
#define FUTEX_PRIVATE_FLAG 128
#define FUTEX_CLOCK_REALTIME 256
@@ -68,6 +69,18 @@ struct futex_waitv {
__u32 __reserved;
};
+/**
+ * struct futex_wait_block - Block of futexes to be waited for
+ * @uaddr: User address of the futex
+ * @val: Futex value expected by userspace
+ * @bitset: Bitset for the optional bitmasked wakeup
+ */
+struct futex_wait_block {
+ __u32 __user *uaddr;
+ __u32 val;
+ __u32 bitset;
+};
+
/*
* Support for robust futexes: the kernel cleans up held futexes at
* thread exit time.
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
index 6f91a07a6a83..2f4d4c04ede2 100644
--- a/kernel/futex/syscalls.c
+++ b/kernel/futex/syscalls.c
@@ -158,6 +158,7 @@ static __always_inline bool futex_cmd_has_timeout(u32 cmd)
case FUTEX_LOCK_PI2:
case FUTEX_WAIT_BITSET:
case FUTEX_WAIT_REQUEUE_PI:
+ case FUTEX_WAIT_MULTIPLE:
return true;
}
return false;
@@ -170,13 +171,79 @@ futex_init_timeout(u32 cmd, u32 op, struct timespec64 *ts, ktime_t *t)
return -EINVAL;
*t = timespec64_to_ktime(*ts);
- if (cmd == FUTEX_WAIT)
+ if (cmd == FUTEX_WAIT || cmd == FUTEX_WAIT_MULTIPLE)
*t = ktime_add_safe(ktime_get(), *t);
else if (cmd != FUTEX_LOCK_PI && !(op & FUTEX_CLOCK_REALTIME))
*t = timens_ktime_to_host(CLOCK_MONOTONIC, *t);
return 0;
}
+/**
+ * futex_read_wait_block - Read an array of futex_wait_block from userspace
+ * @uaddr: Userspace address of the block
+ * @count: Number of blocks to be read
+ *
+ * This function creates and allocates an array of futex_q (we zero it to
+ * initialize the fields) and then, for each futex_wait_block element from
+ * userspace, fills a futex_q element with the proper values.
+ */
+inline struct futex_vector *futex_read_wait_block(u32 __user *uaddr, u32 count)
+{
+ unsigned int i;
+ struct futex_vector *futexv;
+ struct futex_wait_block fwb;
+ struct futex_wait_block __user *entry =
+ (struct futex_wait_block __user *)uaddr;
+
+ if (!count || count > FUTEX_WAITV_MAX)
+ return ERR_PTR(-EINVAL);
+
+ futexv = kcalloc(count, sizeof(*futexv), GFP_KERNEL);
+ if (!futexv)
+ return ERR_PTR(-ENOMEM);
+
+ for (i = 0; i < count; i++) {
+ if (copy_from_user(&fwb, &entry[i], sizeof(fwb))) {
+ kfree(futexv);
+ return ERR_PTR(-EFAULT);
+ }
+
+ futexv[i].w.flags = FUTEX_32;
+ futexv[i].w.val = fwb.val;
+ futexv[i].w.uaddr = (uintptr_t) (fwb.uaddr);
+ futexv[i].q = futex_q_init;
+ }
+
+ return futexv;
+}
+
+int futex_wait_multiple(struct futex_vector *vs, unsigned int count,
+ struct hrtimer_sleeper *to);
+
+int futex_opcode_31(ktime_t *abs_time, u32 __user *uaddr, int count)
+{
+ int ret;
+ struct futex_vector *vs;
+ struct hrtimer_sleeper *to = NULL, timeout;
+
+ to = futex_setup_timer(abs_time, &timeout, 0, 0);
+
+ vs = futex_read_wait_block(uaddr, count);
+
+ if (IS_ERR(vs))
+ return PTR_ERR(vs);
+
+ ret = futex_wait_multiple(vs, count, abs_time ? to : NULL);
+ kfree(vs);
+
+ if (to) {
+ hrtimer_cancel(&to->timer);
+ destroy_hrtimer_on_stack(&to->timer);
+ }
+
+ return ret;
+}
+
SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
const struct __kernel_timespec __user *, utime,
u32 __user *, uaddr2, u32, val3)
@@ -196,6 +263,9 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
tp = &t;
}
+ if (cmd == FUTEX_WAIT_MULTIPLE)
+ return futex_opcode_31(tp, uaddr, val);
+
return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
}
@@ -392,6 +462,9 @@ SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val,
tp = &t;
}
+ if (cmd == FUTEX_WAIT_MULTIPLE)
+ return futex_opcode_31(tp, uaddr, val);
+
return do_futex(uaddr, op, val, tp, uaddr2, (unsigned long)utime, val3);
}
#endif /* CONFIG_COMPAT_32BIT_TIME */
--
2.33.1
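For comparison, new clients can target the futex_waitv(2) entry point that futex_opcode_31() forwards to. A minimal sketch, assuming a 5.16+ kernel with matching uapi headers (struct futex_waitv and FUTEX_32) and the x86_64 syscall number 449 as a fallback for older headers; note the timeout is absolute here, unlike the relative timeout of opcode 31.

/* Hypothetical example: the equivalent wait using the new futex_waitv()
 * interface that futex_opcode_31() translates into. */
#include <linux/futex.h>        /* struct futex_waitv, FUTEX_32 (5.16+) */
#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

#ifndef __NR_futex_waitv
#define __NR_futex_waitv 449    /* x86_64; assumption for older headers */
#endif

int main(void)
{
	uint32_t f1 = 0, f2 = 0;
	struct futex_waitv waiters[2] = {
		{ .val = 0, .uaddr = (uintptr_t)&f1, .flags = FUTEX_32 },
		{ .val = 0, .uaddr = (uintptr_t)&f2, .flags = FUTEX_32 },
	};
	/* Unlike opcode 31, futex_waitv() takes an absolute timeout. */
	struct timespec ts;
	clock_gettime(CLOCK_MONOTONIC, &ts);
	ts.tv_sec += 1;

	long woken = syscall(__NR_futex_waitv, waiters, 2, 0,
			     &ts, CLOCK_MONOTONIC);
	if (woken >= 0)
		printf("futex index %ld was woken\n", woken);
	else
		perror("futex_waitv");
	return 0;
}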

File diff suppressed because it is too large.

View File

@ -0,0 +1,189 @@
From e5e77ad2223f662e1615266d8ef39a8db7e65a70 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Felix=20H=C3=A4dicke?= <felixhaedicke@web.de>
Date: Thu, 19 Nov 2020 09:22:32 +0100
Subject: HID: quirks: Add Apple Magic Trackpad 2 to hid_have_special_driver
list
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
The Apple Magic Trackpad 2 is handled by the magicmouse driver, and
there were severe stability issues when both drivers (hid-generic and
hid-magicmouse) were loaded for this device.
Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=210241
Signed-off-by: Felix Hädicke <felixhaedicke@web.de>
---
drivers/hid/hid-quirks.c | 2 ++
1 file changed, 2 insertions(+)
diff --git a/drivers/hid/hid-quirks.c b/drivers/hid/hid-quirks.c
index bf7ecab5d9e5..142e9dae2837 100644
--- a/drivers/hid/hid-quirks.c
+++ b/drivers/hid/hid-quirks.c
@@ -478,6 +478,8 @@ static const struct hid_device_id hid_have_special_driver[] = {
#if IS_ENABLED(CONFIG_HID_MAGICMOUSE)
{ HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGICMOUSE) },
{ HID_BLUETOOTH_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGICTRACKPAD) },
+ { HID_BLUETOOTH_DEVICE(BT_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGICTRACKPAD2) },
+ { HID_USB_DEVICE(USB_VENDOR_ID_APPLE, USB_DEVICE_ID_APPLE_MAGICTRACKPAD2) },
#endif
#if IS_ENABLED(CONFIG_HID_MAYFLASH)
{ HID_USB_DEVICE(USB_VENDOR_ID_DRAGONRISE, USB_DEVICE_ID_DRAGONRISE_PS3) },
--
cgit v1.2.3-1-gf6bb5
From e437ac931e89629f952ce9f3f9dfe45ac505cd0d Mon Sep 17 00:00:00 2001
From: Joshua Ashton <joshua@froggi.es>
Date: Tue, 5 Jan 2021 19:46:01 +0000
Subject: [PATCH] drm/amdgpu: don't limit gtt size on apus
Since commit 24562523688b ("Revert "drm/amd/amdgpu: set gtt size
according to system memory size only""), the GTT size has been capped
at 3GiB or the VRAM size, whichever is bigger.
This is problematic on APU systems with a small carveout (notably,
those that ship with dGPUs, where the carveout size is unconfigurable),
where the carveout can be as low as 128MiB.
This patch makes the GTT size heuristic always use 3/4 of the system
memory size on APUs, applying the 3GiB/VRAM-size cap only on devices
with dedicated video memory.
Fixes: 24562523688b ("Revert drm/amd/amdgpu: set gtt size according to
system memory size only")
Signed-off-by: Joshua Ashton <joshua@froggi.es>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 5 +++--
drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 12 +++++++++---
2 files changed, 12 insertions(+), 5 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 72efd579ec5e..a5a41e9272d6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -192,8 +192,9 @@ module_param_named(gartsize, amdgpu_gart_size, uint, 0600);
/**
* DOC: gttsize (int)
- * Restrict the size of GTT domain in MiB for testing. The default is -1 (It's VRAM size if 3GB < VRAM < 3/4 RAM,
- * otherwise 3/4 RAM size).
+ * Restrict the size of GTT domain in MiB for testing. The default is -1 (on APUs this is 3/4th
+ * of the system memory; on dGPUs this is 3GiB or the VRAM size, whichever is bigger,
+ * with an upper bound of 3/4th of system memory).
*/
MODULE_PARM_DESC(gttsize, "Size of the GTT domain in megabytes (-1 = auto)");
module_param_named(gttsize, amdgpu_gtt_size, int, 0600);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 4d8f19ab1014..294f26f4f310 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1865,9 +1865,15 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
struct sysinfo si;
si_meminfo(&si);
- gtt_size = min(max((AMDGPU_DEFAULT_GTT_SIZE_MB << 20),
- adev->gmc.mc_vram_size),
- ((uint64_t)si.totalram * si.mem_unit * 3/4));
+ gtt_size = (uint64_t)si.totalram * si.mem_unit * 3/4;
+ /* If we have dedicated memory, limit our GTT size to
+ * 3GiB or VRAM size, whichever is bigger
+ */
+ if (!(adev->flags & AMD_IS_APU)) {
+ gtt_size = min(max(AMDGPU_DEFAULT_GTT_SIZE_MB << 20,
+ adev->gmc.mc_vram_size),
+ gtt_size);
+ }
}
else
gtt_size = (uint64_t)amdgpu_gtt_size << 20;
--
2.30.0
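The patched heuristic can be sanity-checked with a couple of worked examples. A standalone sketch follows (illustrative only, not amdgpu code; the function and constant names are local to the example):

/* Standalone model of the patched GTT sizing heuristic, for illustration. */
#include <stdint.h>
#include <stdio.h>

#define DEFAULT_GTT_BYTES (3072ULL << 20)   /* AMDGPU_DEFAULT_GTT_SIZE_MB */

static uint64_t gtt_size(uint64_t sysram, uint64_t vram, int is_apu)
{
	uint64_t gtt = sysram * 3 / 4;          /* 3/4 of system memory */

	if (!is_apu) {
		/* dGPU: cap at max(3GiB, VRAM), as in the patch above */
		uint64_t cap = DEFAULT_GTT_BYTES > vram ? DEFAULT_GTT_BYTES : vram;
		if (gtt > cap)
			gtt = cap;
	}
	return gtt;
}

int main(void)
{
	/* APU, 16 GiB RAM, 128 MiB carveout: GTT = 12 GiB */
	printf("APU:  %llu MiB\n",
	       (unsigned long long)(gtt_size(16ULL << 30, 128ULL << 20, 1) >> 20));
	/* dGPU, 16 GiB RAM, 8 GiB VRAM: min(12 GiB, max(3, 8) GiB) = 8 GiB */
	printf("dGPU: %llu MiB\n",
	       (unsigned long long)(gtt_size(16ULL << 30, 8ULL << 30, 0) >> 20));
	return 0;
}

On the 16GiB-RAM APU with a 128MiB carveout, the old min(max(3GiB, VRAM), 3/4 RAM) formula yielded only 3GiB of GTT; the patched heuristic yields 12GiB.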
From f7f49141a5dbe9c99d78196b58c44307fb2e6be3 Mon Sep 17 00:00:00 2001
From: Tk-Glitch <ti3nou@gmail.com>
Date: Wed, 3 Feb 2021 11:20:12 +0200
Subject: Revert "cpufreq: Avoid configuring old governors as default with intel_pstate"
This is undesirable behavior for us, since our aggressive ondemand governor
performs better than schedutil for gaming when using intel_pstate in passive
mode. It also interferes with our option to select the desired default governor.
diff --git a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
index 2c7171e0b0010..85de313ddec29 100644
--- a/drivers/cpufreq/Kconfig
+++ b/drivers/cpufreq/Kconfig
@@ -71,7 +71,6 @@ config CPU_FREQ_DEFAULT_GOV_USERSPACE
config CPU_FREQ_DEFAULT_GOV_ONDEMAND
bool "ondemand"
- depends on !(X86_INTEL_PSTATE && SMP)
select CPU_FREQ_GOV_ONDEMAND
select CPU_FREQ_GOV_PERFORMANCE
help
@@ -83,7 +84,6 @@ config CPU_FREQ_DEFAULT_GOV_ONDEMAND
config CPU_FREQ_DEFAULT_GOV_CONSERVATIVE
bool "conservative"
- depends on !(X86_INTEL_PSTATE && SMP)
select CPU_FREQ_GOV_CONSERVATIVE
select CPU_FREQ_GOV_PERFORMANCE
help
https://lore.kernel.org/lkml/20210819004305.20203-1-deepak.sharma@amd.com/
From: Deepak Sharma <deepak.sharma@amd.com>
To: <deepak.sharma@amd.com>
Cc: "Rafael J. Wysocki" <rjw@rjwysocki.net>,
Len Brown <len.brown@intel.com>, Pavel Machek <pavel@ucw.cz>,
Thomas Gleixner <tglx@linutronix.de>,
"Ingo Molnar" <mingo@redhat.com>, Borislav Petkov <bp@alien8.de>,
"maintainer:X86 ARCHITECTURE (32-BIT AND 64-BIT)"
<x86@kernel.org>, "H. Peter Anvin" <hpa@zytor.com>,
"open list:SUSPEND TO RAM" <linux-pm@vger.kernel.org>,
"open list:X86 ARCHITECTURE (32-BIT AND 64-BIT)"
<linux-kernel@vger.kernel.org>
Subject: [PATCH] x86/ACPI/State: Optimize C3 entry on AMD CPUs
Date: Wed, 18 Aug 2021 17:43:05 -0700
Message-ID: <20210819004305.20203-1-deepak.sharma@amd.com>
AMD CPUs which support C3 share caches, so it's not necessary to flush
the caches in software before entering C3; doing so causes a performance
drop for the cores which share caches. ARB_DIS is not used in the current
AMD C-state implementation, so set the related flags correctly.
Signed-off-by: Deepak Sharma <deepak.sharma@amd.com>
---
arch/x86/kernel/acpi/cstate.c | 15 +++++++++++++++
1 file changed, 15 insertions(+)
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 7de599eba7f0..62a5986d625a 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -79,6 +79,21 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags,
*/
flags->bm_control = 0;
}
+ if (c->x86_vendor == X86_VENDOR_AMD) {
+ /*
+ * For all AMD CPUs that support C3, caches should not be
+ * flushed by software while entering C3 type state. Set
+ * bm_check to 1 so that the kernel doesn't need to execute
+ * a cache flush operation.
+ */
+ flags->bm_check = 1;
+ /*
+ * In the current AMD C-state implementation, ARB_DIS is no longer
+ * used, so set bm_control to zero to indicate ARB_DIS is not
+ * required while entering C3 type state.
+ */
+ flags->bm_control = 0;
+ }
}
EXPORT_SYMBOL(acpi_processor_power_init_bm_check);
--
2.25.1

View File

@ -0,0 +1,14 @@
diff --git a/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c b/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c
index edd21d14e64f..793bd586b80b 100644
--- a/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c
+++ b/drivers/gpu/drm/i915/gem/i915_gem_shrinker.c
@@ -357,8 +357,7 @@ i915_gem_shrinker_scan(struct shrinker *shrinker, struct shrink_control *sc)
sc->nr_to_scan,
&sc->nr_scanned,
I915_SHRINK_BOUND |
- I915_SHRINK_UNBOUND |
- I915_SHRINK_WRITEBACK);
+ I915_SHRINK_UNBOUND);
if (sc->nr_scanned < sc->nr_to_scan && current_is_kswapd()) {
intel_wakeref_t wakeref;