Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions Compiling.md
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,3 @@ As also mentioned in the instructions below but repeated here for visibility, if
* Pre-trained neural nets are available at [the main training website](https://katagotraining.org/).
* You will probably want to edit `configs/gtp_example.cfg` (see "Tuning for Performance" above).
* If using OpenCL, you will want to verify that KataGo is picking up the correct device when you run it (e.g. some systems may have both an Intel CPU OpenCL and GPU OpenCL, if KataGo appears to pick the wrong one, you can correct this by specifying `openclGpuToUse` in `configs/gtp_example.cfg`).
* If you want to run `synchronous_loop.sh` on macOS, do the following steps:
* Install GNU coreutils `brew install coreutils` to support a `head` tool that can take negative numbers (`head -n -5` in `train.sh`)
* Install GNU findutils `brew install findutils` to support a `find` tool that supports `-printf` option, that's used by `export_model_for_selfplay.sh`. After that, fix `find` with `gfind` in the script.
Note: you can try to avoid editing `export_model_for_selfplay.sh` by putting the installed findutils first on `PATH`: `export PATH="/opt/homebrew/opt/findutils/libexec/gnubin:$PATH"`, or by defining the alias `alias find="gfind"`. However, this does not always work.
8 changes: 7 additions & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,11 @@ elseif(USE_BACKEND STREQUAL "METAL")
message(FATAL_ERROR "Project requires building with AppleClang. Have ${CMAKE_CXX_COMPILER_ID}")
endif()
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/external/macos/cmake/modules")

if (NOT CMAKE_OSX_SYSROOT)
execute_process(COMMAND xcrun --show-sdk-path OUTPUT_VARIABLE CMAKE_OSX_SYSROOT OUTPUT_STRIP_TRAILING_WHITESPACE)
endif()

include(InitializeSwift)
include(AddSwift)
set(CMAKE_OSX_DEPLOYMENT_TARGET 13.0)
Expand Down Expand Up @@ -341,7 +346,6 @@ elseif(USE_BACKEND STREQUAL "TENSORRT")
if((NOT TENSORRT_INCLUDE_DIR))
message(FATAL_ERROR "${ColorBoldRed} NvInfer.h was NOT found, specify TENSORRT_INCLUDE_DIR to indicate where it is. ${ColorReset}")
endif()
find_library(TENSORRT_LIBRARY nvinfer HINTS ${TENSORRT_ROOT_DIR} PATH_SUFFIXES lib)

# Hackily extract out the version from the TensorRT header
# In each case, try the old format and on failure try the new format.
Expand Down Expand Up @@ -388,6 +392,8 @@ elseif(USE_BACKEND STREQUAL "TENSORRT")
message(FATAL_ERROR "Could not determine TensorRT version from header file")
endif()

find_library(TENSORRT_LIBRARY NAMES nvinfer nvinfer_${TENSORRT_VERSION_MAJOR} HINTS ${TENSORRT_ROOT_DIR} PATH_SUFFIXES lib)

# Version 8 is required for serializing the builder timing cache.
# Version 8.2 is required for eliminating the global logger for Builder and Runtime.
# Version 8.5 is required for eliminating many deprecated APIs and adopting new features.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,12 @@ shift
#and using gating disables the export script from making extraneous selfplay data dirs.
USEGATING=1

if command -v python3 >/dev/null 2>&1; then
PYTHON=python3
else
PYTHON=python
fi

GITROOTDIR="$(git rev-parse --show-toplevel)"

basedir="$(realpath "$BASEDIRRAW")"
Expand Down Expand Up @@ -75,7 +81,7 @@ cp -r "$GITROOTDIR"/python/selfplay "$DATED_ARCHIVE"
while true
do
echo "BEGINNING SUMMARIZE------------------------------"
time python3 ./summarize_old_selfplay_files.py "$basedir"/selfplay/ \
time $PYTHON ./summarize_old_selfplay_files.py "$basedir"/selfplay/ \
-old-summary-file-to-assume-correct "$basedir"/selfplay.summary.json \
-new-summary-file "$basedir"/selfplay.summary.json.tmp
mv "$basedir"/selfplay.summary.json.tmp "$basedir"/selfplay.summary.json
Expand Down
42 changes: 27 additions & 15 deletions python/selfplay/distributed/upload_model_for_selfplay.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,12 @@ shift

#------------------------------------------------------------------------------

if command -v python3 >/dev/null 2>&1; then
PYTHON=python3
else
PYTHON=python
fi

Comment on lines +28 to +33
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a good way to guard against the case where "python3" doesn't exist and "python" is "python2"? In that case it would probably be best to fail rather than attempt to run python-3 scripts.

mkdir -p "$BASEDIR"/modelstobetested
mkdir -p "$BASEDIR"/modelsuploaded

Expand All @@ -33,9 +39,15 @@ function uploadStuff() {
TODIR="$2"

#Sort by timestamp so that we process in order of oldest to newest if there are multiple
for FILEPATH in $(find "$BASEDIR"/"$FROMDIR"/ -mindepth 1 -maxdepth 1 -printf "%T@ %p\n" | sort -n | cut -d ' ' -f 2)
# Use python here to avoid 'find -printf' which is not portable to macOS
# Use sys.argv to safely pass directory name with spaces/quotes
$PYTHON -c "import os, sys; d=sys.argv[1]; print('\n'.join(sorted([os.path.join(d, f) for f in os.listdir(d)], key=lambda x: os.path.getmtime(x))))" "$BASEDIR/$FROMDIR" 2>/dev/null | while read -r FILEPATH
do
if [ ${FILEPATH: -10} == ".uploading" ]
if [ -z "$FILEPATH" ]
then
continue
fi
if [ "${FILEPATH: -10}" == ".uploading" ]
then
echo "Skipping upload tmp file:" "$FILEPATH"
else
Expand All @@ -49,23 +61,23 @@ function uploadStuff() {

if [ -d "$BASEDIR"/modelsuploaded/"$NAME" ]
then
echo "Model with same name aleady exists, so skipping:" "$SRC"
echo "Model with same name already exists, so skipping:" "$SRC"
else
rm -rf "$TMPDST"
mkdir "$TMPDST"

TOBEZIPPED="$TMPDST"/"$RUNNAME"-"$NAME"
TOBEZIPPED="$TMPDST/$RUNNAME-$NAME"
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any particular reason for changing the quoting style on this and subsequent lines? Just a style thing or is there some other reason?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No particular reason, just a style thing (suggested by the Junie AI assistant). I can roll it back to keep the changes minimal.

mkdir "$TOBEZIPPED"

# Build zip containing the ckpt
cp "$SRC"/model.ckpt "$TOBEZIPPED"/model.ckpt
(cd "$TMPDST"; zip -r "$RUNNAME"-"$NAME".zip "$RUNNAME"-"$NAME"/)
cp "$SRC/model.ckpt" "$TOBEZIPPED/model.ckpt"
(cd "$TMPDST"; zip -r "$RUNNAME-$NAME.zip" "$RUNNAME-$NAME/")
rm "$TOBEZIPPED"/*
rmdir "$TOBEZIPPED"

cp "$SRC"/model.bin.gz "$TMPDST"/"$RUNNAME"-"$NAME".bin.gz
cp "$SRC"/metadata.json "$TMPDST"/metadata.json
cp "$SRC"/log.txt "$TMPDST"/log.txt
cp "$SRC/model.bin.gz" "$TMPDST/$RUNNAME-$NAME.bin.gz"
cp "$SRC/metadata.json" "$TMPDST/metadata.json"
cp "$SRC/log.txt" "$TMPDST/log.txt"

#Sleep a little to allow some tolerance on the filesystem
sleep 3
Expand All @@ -76,13 +88,13 @@ function uploadStuff() {
do
set +e
set -x
python3 ./upload_model.py \
$PYTHON ./upload_model.py \
-run-name "$RUNNAME" \
-model-name "$RUNNAME"-"$NAME" \
-model-file "$TMPDST"/"$RUNNAME"-"$NAME".bin.gz \
-model-zip "$TMPDST"/"$RUNNAME"-"$NAME".zip \
-upload-log-file "$TMPDST"/upload_log.txt \
-metadata-file "$TMPDST"/metadata.json \
-model-name "$RUNNAME-$NAME" \
-model-file "$TMPDST/$RUNNAME-$NAME.bin.gz" \
-model-zip "$TMPDST/$RUNNAME-$NAME.zip" \
-upload-log-file "$TMPDST/upload_log.txt" \
-metadata-file "$TMPDST/metadata.json" \
-parents-dir "$TARGETDIR" \
-connection-config "$CONNECTION_CONFIG" \
-rating-only "$RATING_ONLY"
Expand Down
45 changes: 28 additions & 17 deletions python/selfplay/export_model_for_selfplay.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,12 @@ shift

#------------------------------------------------------------------------------

if command -v python3 >/dev/null 2>&1; then
PYTHON=python3
else
PYTHON=python
fi

mkdir -p "$BASEDIR"/torchmodels_toexport
mkdir -p "$BASEDIR"/torchmodels_toexport_extra
mkdir -p "$BASEDIR"/modelstobetested
Expand All @@ -33,47 +39,53 @@ function exportStuff() {
FROMDIR="$1"
TODIR="$2"

#Sort by timestamp so that we process in order of oldest to newest if there are multiple
for FILEPATH in $(find "$BASEDIR"/"$FROMDIR"/ -mindepth 1 -maxdepth 1 -printf "%T@ %p\n" | sort -n | cut -d ' ' -f 2)
# Sort by timestamp so that we process in order of oldest to newest if there are multiple
# Use python here to avoid 'find -printf' which is not portable to macOS
# Use sys.argv to safely pass directory name with spaces/quotes
$PYTHON -c "import os, sys; d=sys.argv[1]; print('\n'.join(sorted([os.path.join(d, f) for f in os.listdir(d)], key=lambda x: os.path.getmtime(x))))" "$BASEDIR/$FROMDIR" 2>/dev/null | while read -r FILEPATH
do
#Make sure to skip tmp directories that are transiently there by the training,
#they are probably in the process of being written
if [ ${FILEPATH: -4} == ".tmp" ]
if [ -z "$FILEPATH" ]
then
continue
fi
if [ "${FILEPATH: -4}" == ".tmp" ]
then
echo "Skipping tmp file:" "$FILEPATH"
elif [ ${FILEPATH: -9} == ".exported" ]
elif [ "${FILEPATH: -9}" == ".exported" ]
then
echo "Skipping self tmp file:" "$FILEPATH"
else
echo "Found model to export:" "$FILEPATH"
NAME="$(basename "$FILEPATH")"

SRC="$BASEDIR"/"$FROMDIR"/"$NAME"
TMPDST="$BASEDIR"/"$FROMDIR"/"$NAME".exported
TARGET="$BASEDIR"/"$TODIR"/"$NAME"
SRC="$BASEDIR/$FROMDIR/$NAME"
TMPDST="$BASEDIR/$FROMDIR/$NAME.exported"
TARGET="$BASEDIR/$TODIR/$NAME"

if [ -d "$BASEDIR"/modelstobetested/"$NAME" ] || \
[ -d "$BASEDIR"/rejectedmodels/"$NAME" ] || \
[ -d "$BASEDIR"/models/"$NAME" ] || \
[ -d "$BASEDIR"/models_extra/"$NAME" ] || \
[ -d "$BASEDIR"/modelsuploaded/"$NAME" ]
then
echo "Model with same name aleady exists, so skipping:" "$SRC"
echo "Model with same name already exists, so skipping:" "$SRC"
else
rm -rf "$TMPDST"
mkdir "$TMPDST"

set -x
python3 ./export_model_pytorch.py \
-checkpoint "$SRC"/model.ckpt \
$PYTHON ./export_model_pytorch.py \
-checkpoint "$SRC/model.ckpt" \
-export-dir "$TMPDST" \
-model-name "$NAMEPREFIX""-""$NAME" \
-model-name "$NAMEPREFIX-$NAME" \
-filename-prefix model \
-use-swa

python3 ./clean_checkpoint.py \
-checkpoint "$SRC"/model.ckpt \
-output "$TMPDST"/model.ckpt
$PYTHON ./clean_checkpoint.py \
-checkpoint "$SRC/model.ckpt" \
-output "$TMPDST/model.ckpt"
set +x

rm -r "$SRC"
Expand All @@ -87,9 +99,8 @@ function exportStuff() {
then
if [ "$TODIR" != "models_extra" ]
then
mkdir -p "$BASEDIR"/selfplay/"$NAME"
mkdir -p "$BASEDIR"/selfplay/"$NAME"/sgfs
mkdir -p "$BASEDIR"/selfplay/"$NAME"/tdata
mkdir -p "$BASEDIR/selfplay/$NAME/sgfs"
mkdir -p "$BASEDIR/selfplay/$NAME/tdata"
fi
fi

Expand Down
14 changes: 10 additions & 4 deletions python/selfplay/shuffle.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,12 @@ shift

#------------------------------------------------------------------------------

if command -v python3 >/dev/null 2>&1; then
PYTHON=python3
else
PYTHON=python
fi

OUTDIR=$(date "+%Y%m%d-%H%M%S")

mkdir -p "$BASEDIR"/shuffleddata/"$OUTDIR".tmp
Expand All @@ -36,7 +42,7 @@ echo "Beginning shuffle at" $(date "+%Y-%m-%d %H:%M:%S")
if [[ -n "${SKIP_VALIDATE:-}" ]]
then
(
time python3 ./shuffle.py \
time $PYTHON ./shuffle.py \
"$BASEDIR"/selfplay/ \
-expand-window-per-row 0.4 \
-taper-window-exponent 0.65 \
Expand All @@ -56,7 +62,7 @@ then
else
# Randomly peels off 5% of files generated by selfplay as validation data
(
time python3 ./shuffle.py \
time $PYTHON ./shuffle.py \
"$BASEDIR"/selfplay/ \
-expand-window-per-row 0.4 \
-taper-window-exponent 0.65 \
Expand All @@ -74,7 +80,7 @@ else
wait
)
(
time python3 ./shuffle.py \
time $PYTHON ./shuffle.py \
"$BASEDIR"/selfplay/ \
-expand-window-per-row 0.4 \
-taper-window-exponent 0.65 \
Expand Down Expand Up @@ -110,7 +116,7 @@ mv "$BASEDIR"/shuffleddata/"$OUTDIR".tmp "$BASEDIR"/shuffleddata/"$OUTDIR"
#This should be VERY conservative and allow plenty of time for the training to switch
#to newer ones as they get generated.
echo "Cleaning up any old dirs"
find "$BASEDIR"/shuffleddata/ -mindepth 1 -maxdepth 1 -type d -mmin +120 | sort | head -n -5 | xargs --no-run-if-empty rm -r
find "$BASEDIR"/shuffleddata/ -mindepth 1 -maxdepth 1 -type d -mmin +120 -print0 | sort -z | head -z -n -5 | xargs -0 --no-run-if-empty rm -r
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does `-z` exist on macOS as a flag for these commands? https://ss64.com/mac/sort.html doesn't list it.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed, it looks like macOS doesn't support the `-z` argument. However, I was able to run the script on macOS without problems, which is strange.


echo "Finished shuffle at" $(date "+%Y-%m-%d %H:%M:%S")
#Make a little space between shuffles
Expand Down
8 changes: 7 additions & 1 deletion python/selfplay/shuffle_loop.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@ shift
BATCHSIZE="$1"
shift

if command -v python3 >/dev/null 2>&1; then
PYTHON=python3
else
PYTHON=python
fi

GITROOTDIR="$(git rev-parse --show-toplevel)"

basedir="$(realpath "$BASEDIRRAW")"
Expand All @@ -44,7 +50,7 @@ cp -r "$GITROOTDIR"/python/selfplay "$DATED_ARCHIVE"
while true
do
rm -f "$basedir"/selfplay.summary.json.tmp
time python3 ./summarize_old_selfplay_files.py "$basedir"/selfplay/ \
time $PYTHON ./summarize_old_selfplay_files.py "$basedir"/selfplay/ \
-old-summary-file-to-assume-correct "$basedir"/selfplay.summary.json \
-new-summary-file "$basedir"/selfplay.summary.json.tmp
mv "$basedir"/selfplay.summary.json.tmp "$basedir"/selfplay.summary.json
Expand Down
9 changes: 8 additions & 1 deletion python/selfplay/train.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,13 @@ EXPORTMODE="$1"
shift

#------------------------------------------------------------------------------

if command -v python3 >/dev/null 2>&1; then
PYTHON=python3
else
PYTHON=python
fi

set -x

mkdir -p "$BASEDIR"/train/"$TRAININGNAME"
Expand Down Expand Up @@ -72,7 +79,7 @@ else
exit 1
fi

time python3 ./train.py \
time $PYTHON ./train.py \
-traindir "$BASEDIR"/train/"$TRAININGNAME" \
-latestdatadir "$BASEDIR"/shuffleddata/ \
-exportdir "$BASEDIR"/"$EXPORT_SUBDIR" \
Expand Down