# Monarch

**Monarch** is a distributed execution engine for PyTorch.

> ⚠️ **Early Development Warning**
> Monarch is currently in an experimental stage. Expect bugs, incomplete features, and APIs that may change in future versions. The project welcomes bug fixes, but to keep things well coordinated, please discuss any significant change before starting the work. It is recommended that you signal your intention to contribute in the issue tracker, either by filing a new issue or by claiming an existing one.

## Installation

```sh
# Create and activate the conda environment
conda create -n monarchenv python=3.10 -y
conda activate monarchenv

# Install the nightly Rust toolchain
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
rustup toolchain install nightly
rustup default nightly

# Install non-Python dependencies
conda install python=3.10
conda install libunwind

# Needs cuda-toolkit-12-0, as that is the version that matches /usr/local/cuda/ on devservers
sudo dnf install cuda-toolkit-12-0 cuda-12-0 libnccl-devel clang-devel

# Install build dependencies
pip install setuptools-rust

# Install torch (via pip, conda, or a source build)
pip install torch

# Install core dependencies; see pyproject.toml for the latest list
pip install pyzmq requests numpy pyre-extensions cloudpickle

# Install test dependencies
pip install pytest pytest-timeout pytest-asyncio

# Install the package
python setup.py install
# ...or set up for development
python setup.py develop

# Run the unit tests; consider -s for more verbose output
pytest python/tests/ -v -m "not oss_skip"
```

## Running examples

TODO

## Debugging

If everything is hanging, set the environment variable
`CONTROLLER_PYSPY_REPORT_INTERVAL=10` to get a py-spy dump of the controller and
its subprocesses every 10 seconds.

Calling `pdb.set_trace()` inside a worker remote function will cause pdb to
attach to the controller process to debug the worker. Keep in mind that if there
are multiple workers, this will create a separate, sequential debug session for
each worker.

For the Rust-based setup, you can adjust the log level with
`RUST_LOG=<log level>` (e.g. `RUST_LOG=debug`).

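As a minimal sketch of this pattern (the `remote` decoration and worker wiring are omitted, and `MONARCH_DEBUG` is a hypothetical variable, not a monarch convention), one way to keep such a breakpoint from firing on every run is to gate it behind an environment variable:

```py
import os
import pdb

def my_remote_fn(x):
    # On a worker, pdb.set_trace() attaches pdb to the controller process.
    # Gate it behind a (hypothetical) env var so normal runs proceed untouched.
    if os.environ.get("MONARCH_DEBUG"):
        pdb.set_trace()
    return x * 2

print(my_remote_fn(21))  # prints 42 when MONARCH_DEBUG is unset
```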
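Both knobs can be set together before launching; the entry script named below is just a placeholder for your own program:

```sh
export CONTROLLER_PYSPY_REPORT_INTERVAL=10  # py-spy dump of the controller every 10s
export RUST_LOG=debug                       # log level for the Rust components
# Then launch your program as usual, e.g.:
# python train.py
```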
## Profiling

The `monarch.profiler` module provides functionality similar to
[PyTorch's Profiler](https://pytorch.org/docs/stable/profiler.html) for model
profiling. It includes `profile` and `record_function` methods. The usage is
generally the same as `torch.profiler.profile` and
`torch.profiler.record_function`, with a few modifications specific to
`monarch.profiler.profile`:

1. `monarch.profiler.profile` exclusively accepts `monarch.profiler.Schedule`, a
   dataclass that mimics `torch.profiler.schedule`.
2. The `on_trace_ready` argument of `monarch.profiler.profile` must be a string
   that specifies the directory where the worker should save the trace files.

Below is an example demonstrating how to use `monarch.profiler`:

```py
import torch
from monarch.profiler import Schedule, profile, record_function

with profile(
    activities=[
        torch.profiler.ProfilerActivity.CPU,
        torch.profiler.ProfilerActivity.CUDA,
    ],
    on_trace_ready="./traces/",
    schedule=Schedule(wait=1, warmup=1, active=2, repeat=1),
    record_shapes=True,
) as prof:
    with record_function("forward"):
        loss = model(batch)

    prof.step()
```

## Memory Viewer

The `monarch.memory` module provides functionality similar to
[PyTorch's Memory Snapshot and Viewer](https://pytorch.org/docs/stable/torch_cuda_memory.html)
for visualizing and analyzing memory usage in PyTorch models. It includes the
`monarch.memory.dump_memory_snapshot` and `monarch.memory.record_memory_history`
methods:

1. `monarch.memory.dump_memory_snapshot`: This function wraps
   `torch.cuda.memory._dump_snapshot()` to dump a memory snapshot remotely. It can
   be used to save a snapshot of the current memory usage to a file.
2. `monarch.memory.record_memory_history`: This function wraps
   `torch.cuda.memory._record_memory_history()` to allow recording memory history
   remotely. It can be used to track memory allocations and deallocations over
   time.

Both functions use `remote` to execute the corresponding remote functions
`_memory_controller_record` and `_memory_controller_dump` on the specified
device mesh.

Below is an example demonstrating how to use `monarch.memory`:

```py
...
monarch.memory.record_memory_history()
for step in range(2):
    batch = torch.randn((8, DIM))
    loss = net(batch)
    ...
monarch.memory.dump_memory_snapshot(dir_snapshots="./snapshots/")
```

## License

Monarch is BSD-3 licensed, as found in the [LICENSE](LICENSE) file.