Skip to content

Commit 093ae7a

Browse files
Xavier Xia authored and ctmarinas committed
arm64/mm: Optimize loop to reduce redundant operations of contpte_ptep_get
This commit optimizes the contpte_ptep_get and contpte_ptep_get_lockless
functions by adding early termination logic. It checks if the dirty and
young bits of orig_pte are already set and skips redundant bit-setting
operations during the loop. This reduces unnecessary iterations and
improves performance.

In order to verify the optimization performance, a test function has been
designed. The function's execution time and instruction statistics have
been traced using perf, and the following are the operation results on a
certain Qualcomm mobile phone chip:

Test Code:

    #include <stdlib.h>
    #include <sys/mman.h>
    #include <stdio.h>

    #define PAGE_SIZE 4096
    #define CONT_PTES 16
    #define TEST_SIZE (4096* CONT_PTES * PAGE_SIZE)
    #define YOUNG_BIT 8

    void rwdata(char *buf)
    {
        for (size_t i = 0; i < TEST_SIZE; i += PAGE_SIZE) {
            buf[i] = 'a';
            volatile char c = buf[i];
        }
    }

    void clear_young_dirty(char *buf)
    {
        if (madvise(buf, TEST_SIZE, MADV_FREE) == -1) {
            perror("madvise free failed");
            free(buf);
            exit(EXIT_FAILURE);
        }
        if (madvise(buf, TEST_SIZE, MADV_COLD) == -1) {
            perror("madvise free failed");
            free(buf);
            exit(EXIT_FAILURE);
        }
    }

    void set_one_young(char *buf)
    {
        for (size_t i = 0; i < TEST_SIZE; i += CONT_PTES * PAGE_SIZE) {
            volatile char c = buf[i + YOUNG_BIT * PAGE_SIZE];
        }
    }

    void test_contpte_perf()
    {
        char *buf;
        int ret = posix_memalign((void **)&buf, CONT_PTES * PAGE_SIZE, TEST_SIZE);
        if ((ret != 0) || ((unsigned long)buf % CONT_PTES * PAGE_SIZE)) {
            perror("posix_memalign failed");
            exit(EXIT_FAILURE);
        }

        rwdata(buf);
    #if TEST_CASE2 || TEST_CASE3
        clear_young_dirty(buf);
    #endif
    #if TEST_CASE2
        set_one_young(buf);
    #endif

        for (int j = 0; j < 500; j++) {
            mlock(buf, TEST_SIZE);
            munlock(buf, TEST_SIZE);
        }
        free(buf);
    }

    int main(void)
    {
        test_contpte_perf();
        return 0;
    }

Descriptions of three test scenarios

Scenario 1
    The data of all 16 PTEs are both dirty and young.
    #define TEST_CASE2 0
    #define TEST_CASE3 0

Scenario 2
    Among the 16 PTEs, only the 8th one is young, and there are no dirty ones.
    #define TEST_CASE2 1
    #define TEST_CASE3 0

Scenario 3
    Among the 16 PTEs, there are neither young nor dirty ones.
    #define TEST_CASE2 0
    #define TEST_CASE3 1

Test results

|Scenario 1         |       Original|       Optimized|
|-------------------|---------------|----------------|
|instructions       |    37912436160|     18731580031|
|test time          |         4.2797|          2.2949|
|overhead of        |               |                |
|contpte_ptep_get() |         21.31%|           4.80%|

|Scenario 2         |       Original|       Optimized|
|-------------------|---------------|----------------|
|instructions       |    36701270862|     36115790086|
|test time          |         3.2335|          3.0874|
|Overhead of        |               |                |
|contpte_ptep_get() |         32.26%|          33.57%|

|Scenario 3         |       Original|       Optimized|
|-------------------|---------------|----------------|
|instructions       |    36706279735|     36750881878|
|test time          |         3.2008|          3.1249|
|Overhead of        |               |                |
|contpte_ptep_get() |         31.94%|          34.59%|

For Scenario 1, optimized code can achieve an instruction benefit of 50.59%
and a time benefit of 46.38%.
For Scenario 2, optimized code can achieve an instruction count benefit of
1.6% and a time benefit of 4.5%.
For Scenario 3, since all the PTEs have neither the young nor the dirty
flag, the branches taken by optimized code should be the same as those of
the original code. In fact, the test results of optimized code seem to be
closer to those of the original code.

Ryan re-ran these tests on Apple M2 with 4K base pages + 64K mTHP.

Scenario 1: reduced to 56% of baseline execution time
Scenario 2: reduced to 89% of baseline execution time
Scenario 3: reduced to 91% of baseline execution time

It can be proven through the test function that the optimization for
contpte_ptep_get is effective. Since the logic of contpte_ptep_get_lockless
is similar to that of contpte_ptep_get, the same optimization scheme is
also adopted for it.
Reviewed-by: Ryan Roberts <[email protected]>
Tested-by: Ryan Roberts <[email protected]>
Reviewed-by: Barry Song <[email protected]>
Signed-off-by: Xavier Xia <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Catalin Marinas <[email protected]>
1 parent 6853acd commit 093ae7a

File tree

1 file changed

+64
-10
lines changed

1 file changed

+64
-10
lines changed

arch/arm64/mm/contpte.c

Lines changed: 64 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -169,17 +169,46 @@ pte_t contpte_ptep_get(pte_t *ptep, pte_t orig_pte)
169169
for (i = 0; i < CONT_PTES; i++, ptep++) {
170170
pte = __ptep_get(ptep);
171171

172-
if (pte_dirty(pte))
172+
if (pte_dirty(pte)) {
173173
orig_pte = pte_mkdirty(orig_pte);
174-
175-
if (pte_young(pte))
174+
for (; i < CONT_PTES; i++, ptep++) {
175+
pte = __ptep_get(ptep);
176+
if (pte_young(pte)) {
177+
orig_pte = pte_mkyoung(orig_pte);
178+
break;
179+
}
180+
}
181+
break;
182+
}
183+
184+
if (pte_young(pte)) {
176185
orig_pte = pte_mkyoung(orig_pte);
186+
i++;
187+
ptep++;
188+
for (; i < CONT_PTES; i++, ptep++) {
189+
pte = __ptep_get(ptep);
190+
if (pte_dirty(pte)) {
191+
orig_pte = pte_mkdirty(orig_pte);
192+
break;
193+
}
194+
}
195+
break;
196+
}
177197
}
178198

179199
return orig_pte;
180200
}
181201
EXPORT_SYMBOL_GPL(contpte_ptep_get);
182202

203+
static inline bool contpte_is_consistent(pte_t pte, unsigned long pfn,
204+
pgprot_t orig_prot)
205+
{
206+
pgprot_t prot = pte_pgprot(pte_mkold(pte_mkclean(pte)));
207+
208+
return pte_valid_cont(pte) && pte_pfn(pte) == pfn &&
209+
pgprot_val(prot) == pgprot_val(orig_prot);
210+
}
211+
183212
pte_t contpte_ptep_get_lockless(pte_t *orig_ptep)
184213
{
185214
/*
@@ -202,7 +231,6 @@ pte_t contpte_ptep_get_lockless(pte_t *orig_ptep)
202231
pgprot_t orig_prot;
203232
unsigned long pfn;
204233
pte_t orig_pte;
205-
pgprot_t prot;
206234
pte_t *ptep;
207235
pte_t pte;
208236
int i;
@@ -219,18 +247,44 @@ pte_t contpte_ptep_get_lockless(pte_t *orig_ptep)
219247

220248
for (i = 0; i < CONT_PTES; i++, ptep++, pfn++) {
221249
pte = __ptep_get(ptep);
222-
prot = pte_pgprot(pte_mkold(pte_mkclean(pte)));
223250

224-
if (!pte_valid_cont(pte) ||
225-
pte_pfn(pte) != pfn ||
226-
pgprot_val(prot) != pgprot_val(orig_prot))
251+
if (!contpte_is_consistent(pte, pfn, orig_prot))
227252
goto retry;
228253

229-
if (pte_dirty(pte))
254+
if (pte_dirty(pte)) {
230255
orig_pte = pte_mkdirty(orig_pte);
256+
for (; i < CONT_PTES; i++, ptep++, pfn++) {
257+
pte = __ptep_get(ptep);
258+
259+
if (!contpte_is_consistent(pte, pfn, orig_prot))
260+
goto retry;
261+
262+
if (pte_young(pte)) {
263+
orig_pte = pte_mkyoung(orig_pte);
264+
break;
265+
}
266+
}
267+
break;
268+
}
231269

232-
if (pte_young(pte))
270+
if (pte_young(pte)) {
233271
orig_pte = pte_mkyoung(orig_pte);
272+
i++;
273+
ptep++;
274+
pfn++;
275+
for (; i < CONT_PTES; i++, ptep++, pfn++) {
276+
pte = __ptep_get(ptep);
277+
278+
if (!contpte_is_consistent(pte, pfn, orig_prot))
279+
goto retry;
280+
281+
if (pte_dirty(pte)) {
282+
orig_pte = pte_mkdirty(orig_pte);
283+
break;
284+
}
285+
}
286+
break;
287+
}
234288
}
235289

236290
return orig_pte;

0 commit comments

Comments
 (0)