Skip to content

Commit 6520578

Browse files
authored
Merge pull request #23 from adam-p/init-reset
Add forceReset flag to Init()
2 parents 22aec40 + bfdc79e commit 6520578

27 files changed

+435
-63
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ captures/
6464

6565
# External native build folder generated in Android Studio 2.2 and later
6666
.externalNativeBuild
67+
.cxx
6768

6869
# Google Services (e.g. APIs or Firebase)
6970
google-services.json

.idea/codeStyles/Project.xml

Lines changed: 109 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

build.gradle

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ buildscript {
77
jcenter()
88
}
99
dependencies {
10-
classpath 'com.android.tools.build:gradle:3.4.1'
10+
classpath 'com.android.tools.build:gradle:3.5.0'
1111

1212

1313
// NOTE: Do not place your application dependencies here; they belong

concurrency.md

Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
1. go back to original code and try it with initial Refresh (current test code) -- if this succeeds, then there's probably a simultaneous-NewTracker problem in released code
2+
2. go back to sync'd code and try it with initial Refresh (current test code)
3+
3. Try with GetEnv and stored this_obj
4+
5+
* NewTracker is never going to be okay with multiple concurrent calls -- tokens get reset
6+
7+
8+
---
9+
10+
There are three unrelated problems in the PsiCash library (core and Android) resulting from concurrent operations. We'll call them the "JNIEnv problem", the "NewTracker problem", and the "Datastore Pauser problem".
11+
12+
## Problem manifestation
13+
14+
Crashes that look like this:
15+
16+
```
17+
pid: 7314, tid: 7336, name: Thread-3 >>> ca.psiphon.psicashlib.test <<<
18+
signal 6 (SIGABRT), code -6 (SI_TKILL), fault addr --------
19+
Abort message: 'java_vm_ext.cc:542] JNI DETECTED ERROR IN APPLICATION: thread Thread[16,tid=7336,Native,Thread*=0xdbfdb400,peer=0x175e2328,"Thread-3"] using JNIEnv* from thread Thread[17,tid=7337,Runnable,Thread*=0xdbfdba00,peer=0x175e2480,"Thread-4"]'
20+
eax 00000000 ebx 00001c92 ecx 00001ca8 edx 00000006
21+
edi 00001c92 esi 00000236
22+
ebp 000024e1 esp c9078228 eip e92fdb39
23+
backtrace:
24+
#00 pc 00000b39 [vdso:e92fd000] (__kernel_vsyscall+9)
25+
#01 pc 0001fdf8 /system/lib/libc.so (syscall+40)
26+
#02 pc 00022ed3 /system/lib/libc.so (abort+115)
27+
#03 pc 004dc8a6 /system/lib/libart.so (art::Runtime::Abort(char const*)+1174)
28+
#04 pc 005cd833 /system/lib/libart.so (_ZNSt3__110__function6__funcIPFvPKcENS_9allocatorIS5_EES4_EclEOS3_+35)
29+
#05 pc 00007ccc /system/lib/libbase.so (android::base::LogMessage::~LogMessage()+828)
30+
#06 pc 0031ab5f /system/lib/libart.so (art::JavaVMExt::JniAbort(char const*, char const*)+1775)
31+
#07 pc 0031ad21 /system/lib/libart.so (art::JavaVMExt::JniAbortV(char const*, char const*, char*)+113)
32+
#08 pc 000d60f7 /system/lib/libart.so (art::(anonymous namespace)::ScopedCheck::AbortF(char const*, ...)+71)
33+
#09 pc 000d46fc /system/lib/libart.so (art::(anonymous namespace)::ScopedCheck::CheckPossibleHeapValue(art::ScopedObjectAccess&, char, art::(anonymous namespace)::JniValueType)+364)
34+
#10 pc 000d3bdb /system/lib/libart.so (art::(anonymous namespace)::ScopedCheck::Check(art::ScopedObjectAccess&, bool, char const*, art::(anonymous namespace)::JniValueType*)+811)
35+
#11 pc 000de1b4 /system/lib/libart.so (art::(anonymous namespace)::CheckJNI::GetStringCharsInternal(char const*, _JNIEnv*, _jstring*, unsigned char*, bool, bool)+916)
36+
#12 pc 000c951d /system/lib/libart.so (art::(anonymous namespace)::CheckJNI::GetStringUTFChars(_JNIEnv*, _jstring*, unsigned char*)+45)
37+
#13 pc 000ddd14 /data/app/ca.psiphon.psicashlib.test-DHndXih0EUfYYycM0w4HJQ==/lib/x86/libpsicash.so (_JNIEnv::GetStringUTFChars(_jstring*, unsigned char*)+100)
38+
#14 pc 000dd7d4 /data/app/ca.psiphon.psicashlib.test-DHndXih0EUfYYycM0w4HJQ==/lib/x86/libpsicash.so (JStringToString(_JNIEnv*, _jstring*)+132)
39+
#15 pc 000e8013 /data/app/ca.psiphon.psicashlib.test-DHndXih0EUfYYycM0w4HJQ==/lib/x86/libpsicash.so
40+
#16 pc 000e47cc /data/app/ca.psiphon.psicashlib.test-DHndXih0EUfYYycM0w4HJQ==/lib/x86/libpsicash.so
41+
#17 pc 000e4637 /data/app/ca.psiphon.psicashlib.test-DHndXih0EUfYYycM0w4HJQ==/lib/x86/libpsicash.so
42+
#18 pc 0016f1a5 /data/app/ca.psiphon.psicashlib.test-DHndXih0EUfYYycM0w4HJQ==/lib/x86/libpsicash.so (std::__ndk1::function<psicash::HTTPResult (psicash::HTTPParams const&)>::operator()(psicash::HTTPParams const&) const+229)
43+
#19 pc 0016cf47 /data/app/ca.psiphon.psicashlib.test-DHndXih0EUfYYycM0w4HJQ==/lib/x86/libpsicash.so (psicash::PsiCash::MakeHTTPRequestWithRetry(std::__ndk1::basic_string<char, std::__ndk1::char_traits<char>, std::__ndk1::allocator<char>> const&, std::__ndk1::basic_string<char, std::__ndk1::char_traits<char>, std::__ndk1::allocator<char>> const&, bool, std::__ndk1::vector<std::__ndk1::pair<std::__ndk1::basic_string<char, std::__ndk1::char_traits<char>, std::__ndk1::allocator<char>>, std::__ndk1::basic_
44+
#20 pc 00178986 /data/app/ca.psiphon.psicashlib.test-DHndXih0EUfYYycM0w4HJQ==/lib/x86/libpsicash.so (psicash::PsiCash::RefreshState(std::__ndk1::vector<std::__ndk1::basic_string<char, std::__ndk1::char_traits<char>, std::__ndk1::allocator<char>>, std::__ndk1::allocator<std::__ndk1::basic_string<char, std::__ndk1::char_traits<char>, std::__ndk1::allocator<char>>>> const&, bool)+4662)
45+
#21 pc 00177734 /data/app/ca.psiphon.psicashlib.test-DHndXih0EUfYYycM0w4HJQ==/lib/x86/libpsicash.so (psicash::PsiCash::RefreshState(std::__ndk1::vector<std::__ndk1::basic_string<char, std::__ndk1::char_traits<char>, std::__ndk1::allocator<char>>, std::__ndk1::allocator<std::__ndk1::basic_string<char, std::__ndk1::char_traits<char>, std::__ndk1::allocator<char>>>> const&)+84)
46+
#22 pc 000acebf /data/app/ca.psiphon.psicashlib.test-DHndXih0EUfYYycM0w4HJQ==/lib/x86/libpsicash.so (Java_ca_psiphon_psicashlib_PsiCashLib_NativeRefreshState+1295)
47+
#23 pc 005f6b97 /system/lib/libart.so (art_quick_generic_jni_trampoline+71)
48+
```
49+
50+
That is resulting from the JNIEnv problem. I only found the other problems when trying out JNIEnv problem reproduction and fixes.
51+
52+
## JNIEnv problem
53+
54+
This is the code flow for a PsiCash Android lib call that has a network request (like RefreshState).
55+
56+
```no-highlight
57+
App Java App HTTP requester
58+
⇅ ⇅
59+
Lib Java Lib Java
60+
⇅ ⇅
61+
C++ JNI glue [1] JNI glue [2]
62+
⇅ ⇅
63+
C++ core lib ⇄ need HTTP req
64+
```
65+
66+
The "App HTTP Requester" is a helper function supplied and set by the app during initialization. Designing its use, I didn't realize that some scenarios (like Android) would require that state present at the beginning of the call (`[1]` above) would be needed when processing the request (`[2]` above), so I didn't provide a way to pass state through the (JNI-ignorant) C++ core code into the HTTP requester.
67+
68+
The state that's needed is the `JNIEnv` instance pointer and the `jobject` which is the `PsiCashLib` Java object instance. `jobject` is shareable between threads (I think, at this point), but `JNIEnv` is not.
69+
70+
I did the state passing in a [super dumb way](https://github.com/Psiphon-Inc/psicash-lib-android/blob/3865fc1860e2a106defcdbc0e150973ff3188d01/psicashlib/src/main/cpp/jniglue.cpp#L324-L329):
71+
```cpp
72+
extern "C" JNIEXPORT jstring
73+
JNICALL
74+
Java_ca_psiphon_psicashlib_PsiCashLib_NativeRefreshState(
75+
JNIEnv* env,
76+
jobject this_obj,
77+
jobjectArray j_purchase_classes) {
78+
79+
... j_purchase_classes --> purchase_classes
80+
81+
GetPsiCash().SetHTTPRequestFn(GetHTTPReqFn(env, this_obj));
82+
83+
auto result = GetPsiCash().RefreshState(purchase_classes);
84+
if (!result) {
85+
return JNI_(WRAP_ERROR(result.error()));
86+
}
87+
88+
return JNI_s(SuccessResponse(*result));
89+
}
90+
```
91+
92+
That effectively creates an HTTP requester function that encloses the `JNIEnv` and `jobject` passed into the call, and sets it as _the_ requester function for the library.
93+
94+
But... if another RefreshState (or NewTransaction) call happens from another thread while the initial call is in progress... a different enclosing function is set as _the_ requester, enclosing the JNIEnv for a different thread. When an attempt is made to access the wrong JNIEnv from the wrong thread... the crash you see above occurs.
95+
96+
### Why didn't it manifest before?
97+
98+
When all the Java lib calls were `synchronized` or read-write-locked, there were no concurrent network calls. (Maybe I had that in mind when I wrote it so dumb? Maybe!)
99+
100+
This is the same answer to the same questions for the other problems as well.
101+
102+
103+
## NewTracker problem
104+
105+
This should probably be called the "first RefreshState and so NewTracker is called" problem.
106+
107+
RefreshState's logic is something like this:
108+
1. If there are no tokens yet, call NewTracker (`/tracker`) and recurse.
109+
- NewTracker gets new tokens, set them and clears the balance in the datastore.
110+
2. Make a `/refresh-state` request to the server, passing tokens.
111+
3. Get back the balance and purchase prices and set them in the datastore.
112+
113+
If different threads concurrently call RefreshState for the first time, then the Tracker may get created twice and different tokens used for the final `/refresh-state` req than were obtained from the final `/tracker` req.
114+
115+
### NewExpiringPurchase/NewTransaction variant
116+
117+
NewExpiringPurchase calls should also not be run concurrently with each other or with RefreshState. We could end up storing a balance that's from an earlier operation, whereas the true final balance is from a later operation.
118+
119+
120+
## Datastore Pauser problem
121+
122+
This might be due to another concurrency effect here: datastore "write pausing". In order to avoid lots of disk writes for little bits of data that will arrive together, the datastore writing can be paused and resumed -- so, pause, set a bunch of values, then unpause to write them. The pauser is just a simple off/on mechanism, [like so](https://github.com/Psiphon-Inc/psicash-lib-core/blob/d9592e7549dace16ec21566f137f6bd4e81c7b1a/datastore.cpp#L48-L60):
123+
124+
```cpp
125+
void Datastore::PauseWrites() {
126+
SYNCHRONIZE(mutex_);
127+
paused_ = true;
128+
}
129+
130+
error::Error Datastore::UnpauseWrites() {
131+
SYNCHRONIZE(mutex_);
132+
if (!paused_) {
133+
return nullerr;
134+
}
135+
paused_ = false;
136+
return FileStore();
137+
}
138+
```
139+
140+
So if multiple threads try to pause and unpause writes at the same time... it's not great. It shouldn't actually cause bad behaviour, since this is what will happen:
141+
142+
1. Thread A pauses writes.
143+
2. Thread B pauses writes.
144+
3. Thread A sets XA, which doesn't go out to disk yet.
145+
4. Thread A sets YA, which doesn't go out to disk yet.
146+
5. Thread B sets XB, which doesn't go out to disk yet.
147+
6. Thread A unpauses, causing write to happen.
148+
7. Thread B sets YB, which writes to disk immediately.
149+
8. Thread B unpauses, which returns an error, which is ignored by [`UserData::WritePauser` dtor](https://github.com/Psiphon-Inc/psicash-lib-core/blob/d9592e7549dace16ec21566f137f6bd4e81c7b1a/userdata.hpp#L57).
150+
151+
The disk thrashing that we wanted to avoid isn't avoided, but the data that ends up on disk isn't incorrect.
152+
153+
## Solution(s)
154+
155+
There are going to be multiple parts to this fix, and probably short and long term parts.
156+
157+
### NewTracker (and NewExpiringPurchase) problem
158+
159+
RefreshState and NewExpiringPurchase need to happen one-at-a-time. So, there needs to be a mutex controlling access. In the long term, I want that mutex to be in the core library, as this problem is shared across platforms. In the short term, maybe it can be in the Java or JNI glue.
160+
161+
### JNIEnv problem
162+
163+
In the short term, mutexing RefreshState and NewExpiringPurchase in Java or JNI negates this problem, as the enclosed values will be valid for the duration of the calls.
164+
165+
In the long term, this should be fixed properly. A few ideas:
166+
167+
1. If the jobject is okay to share between threads, then the JNIEnv for the current thread can be retrieved using `JavaVM::GetEnv` from within the HTTP requester and not much else needs to change.
168+
169+
2. The enclosed HTTP request function could be passed into RefreshState and NewExpiringPurchase for later use.
170+
171+
3. Support could be added for an opaque object (containing jobject and JNIEnv) to be passed through RefreshState and NewExpiringPurchase into the HTTP requester.
172+
173+
I think that's my preferred order.
174+
175+
### Datastore Pauser problem
176+
177+
In the short term, mutexing RefreshState and NewExpiringPurchase in Java or JNI negates this problem, as those are the only places write pausers are used.
178+
179+
In the long term... Maybe use a count up/down when pausing and unpausing? Except if they're happening simultaneously then there's something else wrong (like concurrent RefreshState calls). So maybe there's no point in a fix for this.
180+
181+
Maybe it should just assert if this condition is hit.
182+
183+
### Mutex usage elsewhere
184+
185+
In a previous rev, _all_ PsiCashLib methods were `synchronized`; in a rev after that, they _all_ used read/write locks.
186+
187+
We then rationalized that just mutexing the datastore reads and writes was sufficient, and that our thread-safety guarantees would only be "data won't be corrupt".
188+
189+
I think our new philosophy should be similar: "Writes are mutexed at the method level, reads have no special mutexes; all datastore read/writes are mutexed (to avoid corruption)". Methods that write are: init, setRequestMetadataItem, expirePurchases, removePurchases, refreshState, newExpiringPurchase.
190+
191+
### Short-term fix summary
192+
193+
Mutex the write operations in Java.
194+
195+
The easiest thing would be to just add `synchronized` to them. But Eugene has seen some ANR weirdness that suggests dangling locks. Which we can't explain, except waving our hands and saying "C++/JNI code throws a RuntimeException that bypasses a mutex unlock and is caught upstream somewhere and the app tries to keep running". (Which seems unlikely for about 3 reasons: 1. we haven't seen C++ code throw exceptions; 2. `synchronized` doesn't have an explicit unlock anyway; 3. we shouldn't be catching RuntimeExceptions and then not killing the app.)
196+
197+
So we're going to use a `ReentrantLock` and unlock it in `finally` blocks, so unlocking will definitely happen even if there's a RuntimeException.
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
#Wed May 15 11:31:39 EDT 2019
1+
#Mon Sep 09 16:34:14 EDT 2019
22
distributionBase=GRADLE_USER_HOME
33
distributionPath=wrapper/dists
44
zipStoreBase=GRADLE_USER_HOME
55
zipStorePath=wrapper/dists
6-
distributionUrl=https\://services.gradle.org/distributions/gradle-5.1.1-all.zip
6+
distributionUrl=https\://services.gradle.org/distributions/gradle-5.4.1-all.zip

psicashlib/src/androidTest/java/ca/psiphon/psicashlib/ActivePurchasesTest.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ public class ActivePurchasesTest extends TestBase {
99
@Test
1010
public void simpleSuccess() {
1111
PsiCashLibTester pcl = new PsiCashLibTester();
12-
PsiCashLib.Error err = pcl.init(getTempDir(), new PsiCashLibHelper());
12+
PsiCashLib.Error err = pcl.init(getTempDir(), new PsiCashLibHelper(), false);
1313
assertNull(err);
1414

1515
// Default value, before the first RefreshState
@@ -37,7 +37,7 @@ public void simpleSuccess() {
3737
@Test
3838
public void withPurchases() {
3939
PsiCashLibTester pcl = new PsiCashLibTester();
40-
PsiCashLib.Error err = pcl.init(getTempDir(), new PsiCashLibHelper());
40+
PsiCashLib.Error err = pcl.init(getTempDir(), new PsiCashLibHelper(), false);
4141
assertNull(err);
4242

4343
PsiCashLib.RefreshStateResult res = pcl.refreshState(null);

psicashlib/src/androidTest/java/ca/psiphon/psicashlib/BalanceTest.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ public class BalanceTest extends TestBase {
1010
@Test
1111
public void simpleSuccess() {
1212
PsiCashLibTester pcl = new PsiCashLibTester();
13-
PsiCashLib.Error err = pcl.init(getTempDir(), new PsiCashLibHelper());
13+
PsiCashLib.Error err = pcl.init(getTempDir(), new PsiCashLibHelper(), false);
1414
assertNull(err);
1515

1616
// Default value, before the first RefreshState
@@ -43,7 +43,7 @@ public void simpleSuccess() {
4343
@Test
4444
public void balanceChange() {
4545
PsiCashLibTester pcl = new PsiCashLibTester();
46-
PsiCashLib.Error err = pcl.init(getTempDir(), new PsiCashLibHelper());
46+
PsiCashLib.Error err = pcl.init(getTempDir(), new PsiCashLibHelper(), false);
4747
assertNull(err);
4848

4949
PsiCashLib.RefreshStateResult res = pcl.refreshState(null);

psicashlib/src/androidTest/java/ca/psiphon/psicashlib/ExpirePurchasesTest.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ public class ExpirePurchasesTest extends TestBase {
1010
@Test
1111
public void simpleSuccess() {
1212
PsiCashLibTester pcl = new PsiCashLibTester();
13-
PsiCashLib.Error err = pcl.init(getTempDir(), new PsiCashLibHelper());
13+
PsiCashLib.Error err = pcl.init(getTempDir(), new PsiCashLibHelper(), false);
1414
assertNull(err);
1515

1616
// Default value, before the first RefreshState
@@ -38,7 +38,7 @@ public void simpleSuccess() {
3838
@Test
3939
public void withPurchases() {
4040
PsiCashLibTester pcl = new PsiCashLibTester();
41-
PsiCashLib.Error err = pcl.init(getTempDir(), new PsiCashLibHelper());
41+
PsiCashLib.Error err = pcl.init(getTempDir(), new PsiCashLibHelper(), false);
4242
assertNull(err);
4343

4444
PsiCashLib.RefreshStateResult res = pcl.refreshState(null);

0 commit comments

Comments
 (0)