diff --git a/src/modality-specific-files/behavioral-experiments.md b/src/modality-specific-files/behavioral-experiments.md
index b077fd4b85..7f869ae852 100644
--- a/src/modality-specific-files/behavioral-experiments.md
+++ b/src/modality-specific-files/behavioral-experiments.md
@@ -1,9 +1,9 @@
-# Behavioral experiments (with no neural recordings)
+# Behavioral recordings
 
 !!! example "Example datasets"
 
     Datasets containing behavioral data can be found
-    in the [BIDS examples repository](https://bids-standard.github.io/bids-examples/#behavioral)
+    in the [BIDS examples repository](https://bids-website.readthedocs.io/en/latest/datasets/examples.html#behavioral)
     and can be used as helpful guidance when curating new datasets.
 
 <!--
@@ -15,19 +15,14 @@ and a guide for using macros can be found at
 -->
 {{ MACROS___make_filename_template("raw", datatypes=["beh"]) }}
 
-In addition to logs from behavioral experiments
-performed alongside imaging data acquisitions,
-one MAY also include data from experiments
-performed with no neural recordings.
-The results of those experiments MAY be stored in the `beh` directory
-using the same formats for event timing (`_events.tsv`),
-metadata (`_events.json`),
-physiological (`_physio.tsv.gz`, `_physio.json`)
-and other continuous recordings (`_stim.tsv.gz`, `_stim.json`)
-as for tasks performed during MRI, electrophysiological or other neural recordings.
-Additionally, events files
-that do not include the mandatory `onset` and `duration` columns
-MAY be included,
+The `beh` directory MAY store behavioral recordings such as audio (`_audio.*`) and video (`_video.*`) recordings, physiological (`_physio.*`) recordings, and other continuous recordings (`_stim.tsv.gz`, `_stim.json`).
+Audio and video recordings MAY be of subjects performing tasks, resting-state behavior, or recordings of stimuli being presented to the subject.
+Audio/video recordings MAY occur simultaneously with other recordings, such as BOLD or EEG.
+Relative timing between files can be determined by consulting the `scans.tsv` file.
+If no `scans.tsv` file is present, the alignment is undefined.
+The `beh` directory MAY also contain event timing files (`_events.tsv`) and their associated metadata (`_events.json`) for behavioral experiments performed without accompanying neural recordings (for example, MRI or electrophysiology).
+
+Additionally, events files that do not include the mandatory `onset` and `duration` columns MAY be included,
 but MUST be labeled `_beh.tsv` rather than `_events.tsv`.
 
 The following OPTIONAL columns are pre-defined for behavioral data files:
@@ -76,6 +71,167 @@ A guide for using macros can be found at
 -->
 {{ MACROS___make_sidecar_table("beh.BEHInstitutionInformation") }}
 
+## Audio and video recordings
+
+Audio and video recordings of behaving subjects MAY be stored in the `beh` directory
+using the `_audio` and `_video` suffixes respectively.
+These recordings are typically used to capture vocalizations, speech, facial expressions,
+body movements, or other behavioral aspects during experimental tasks or rest periods.
+
+!!! warning "Privacy and personally identifiable information"
+
+    Audio and video recordings of human subjects often contain personally identifiable
+    information (PII) such as faces, voices, and other identifying features.
+    Data curators MUST take special care to ensure compliance with applicable privacy
+    regulations (such as HIPAA in the United States, GDPR in the European Union, or other
+    local data protection laws) when handling these recordings.
+
+    These recordings are generally more suitable for internal use or for sharing
+    non-human subject data, unless appropriate privacy protections are implemented.
+
+### File formats
+
+Audio recordings MUST use one of the following extensions:
+
+-   `.flac` - Free Lossless Audio Codec
+-   `.mp3` - MPEG Audio Layer III
+-   `.ogg` - Ogg Vorbis
+-   `.wav` - Waveform Audio File Format
+
+Video recordings MUST use one of the following extensions:
+
+-   `.mp4` - MPEG-4 Part 14
+-   `.mkv` - Matroska video container
+-   `.avi` - Audio Video Interleave
+
+### Entities
+
+Audio and video files MAY use the following entities:
+
+-   `task` - OPTIONAL for audio and video recordings
+-   `acq` - OPTIONAL, can distinguish different recording setups
+-   `run` - OPTIONAL, for multiple recordings with identical parameters
+-   `recording` - OPTIONAL, to differentiate simultaneous recordings from different angles, locations, or devices
+-   `split` - OPTIONAL, for continuous recordings split into multiple files
+
+### Examples
+
+<!-- This block generates a file tree.
+A guide for using macros can be found at
+ https://github.com/bids-standard/bids-specification/blob/master/macros_doc.md
+-->
+{{ MACROS___make_filetree_example(
+   {
+   "sub-01": {
+      "beh": {
+         "sub-01_task-rest_video.mp4": "",
+         "sub-01_task-rest_video.json": "",
+         "sub-01_task-stroop_recording-face_video.mp4": "",
+         "sub-01_task-stroop_recording-face_video.json": "",
+         "sub-01_task-stroop_recording-room_video.mp4": "",
+         "sub-01_task-stroop_recording-room_video.json": "",
+         "sub-01_task-vocalization_audio.wav": "",
+         "sub-01_task-vocalization_audio.json": "",
+         },
+      },
+   }
+) }}
+
+For continuous recordings split into multiple files:
+
+<!-- This block generates a file tree.
+A guide for using macros can be found at
+ https://github.com/bids-standard/bids-specification/blob/master/macros_doc.md
+-->
+{{ MACROS___make_filetree_example(
+   {
+   "sub-01": {
+      "ses-01": {
+         "beh": {
+            "sub-01_ses-01_task-freeplay_run-01_split-001_video.mp4": "",
+            "sub-01_ses-01_task-freeplay_run-01_split-002_video.mp4": "",
+            "sub-01_ses-01_task-freeplay_run-01_split-003_video.mp4": "",
+            "sub-01_ses-01_task-freeplay_run-01_video.json": "",
+            },
+         },
+      },
+   }
+) }}
+
+### Sidecar JSON for audio and video recordings
+
+The following metadata fields are available for audio and video recordings:
+
+<!-- This block generates a metadata table.
+These tables are defined in
+  src/schema/rules/sidecars
+The definitions of the fields specified in these tables may be found in
+  src/schema/objects/metadata.yaml
+A guide for using macros can be found at
+ https://github.com/bids-standard/bids-specification/blob/master/macros_doc.md
+-->
+{{ MACROS___make_sidecar_table("beh.AudioVideoDevice") }}
+
+{{ MACROS___make_sidecar_table("beh.AudioVideoStreams") }}
+
+### Example video sidecar JSON
+
+For a video file containing both video and audio streams:
+
+```JSON
+{
+  "TaskName": "RestingState",
+  "Device": "Sony FDR-AX53",
+  "AudioChannelCount": 2,
+  "AudioSampleRate": 48000,
+  "FrameRate": 30.0,
+  "Height": 1080,
+  "Width": 1920,
+  "Duration": 600.5
+}
+```
+
+### Example audio sidecar JSON
+
+For an audio-only recording:
+
+```JSON
+{
+  "TaskName": "Vocalization",
+  "Device": "Zoom H6 Handy Recorder",
+  "AudioChannelCount": 2,
+  "AudioSampleRate": 44100,
+  "Duration": 300.2
+}
+```
+
+### Annotations and events
+
+Behavioral annotations or event markers for audio and video recordings
+SHOULD be stored in accompanying `_events.tsv` files following the standard
+[events file format](../modality-agnostic-files/events.md).
+These events files use the same filename entities as the audio/video file they describe,
+but with the `_events` suffix.
+
+For example:
+
+<!-- This block generates a file tree.
+A guide for using macros can be found at
+ https://github.com/bids-standard/bids-specification/blob/master/macros_doc.md
+-->
+{{ MACROS___make_filetree_example(
+   {
+   "sub-01": {
+      "beh": {
+         "sub-01_task-speech_audio.wav": "",
+         "sub-01_task-speech_audio.json": "",
+         "sub-01_task-speech_events.tsv": "",
+         "sub-01_task-speech_events.json": "",
+         },
+      },
+   }
+) }}
+
 ## Example `_beh.tsv`
 
 ```tsv
diff --git a/src/schema/objects/extensions.yaml b/src/schema/objects/extensions.yaml
index d482f8849d..b7fc5621bb 100644
--- a/src/schema/objects/extensions.yaml
+++ b/src/schema/objects/extensions.yaml
@@ -1,5 +1,11 @@
 ---
 # This file describes valid file extensions in the specification.
+avi:
+  value: .avi
+  display_name: Audio Video Interleave
+  description: |
+    An [Audio Video Interleave](https://en.wikipedia.org/wiki/Audio_Video_Interleave) video file.
+    This format is commonly used for behavioral video recordings.
 ave:
   value: .ave
   display_name: AVE # not sure what ave stands for
@@ -114,6 +120,12 @@ fif:
   display_name: Functional Imaging File Format
   description: |
     An MEG file format used by Neuromag, Elekta, and MEGIN.
+flac:
+  value: .flac
+  display_name: Free Lossless Audio Codec
+  description: |
+    A [FLAC](https://en.wikipedia.org/wiki/FLAC) audio file.
+    This format is commonly used for behavioral audio recordings.
 jpg:
   value: .jpg
   display_name: Joint Photographic Experts Group Format
@@ -153,6 +165,24 @@ md:
   display_name: Markdown
   description: |
     A Markdown file.
+mkv:
+  value: .mkv
+  display_name: Matroska Video
+  description: |
+    A [Matroska](https://www.matroska.org/) video container file.
+    This format is commonly used for behavioral video recordings.
+mp3:
+  value: .mp3
+  display_name: MPEG Audio Layer III
+  description: |
+    An [MP3](https://en.wikipedia.org/wiki/MP3) audio file.
+    This format is commonly used for behavioral audio recordings or auditory stimuli.
+mp4:
+  value: .mp4
+  display_name: MPEG-4 Part 14
+  description: |
+    An [MPEG-4 Part 14](https://en.wikipedia.org/wiki/MPEG-4_Part_14) video container file.
+    This format is commonly used for behavioral video recordings and may contain both video and audio streams.
 mefd:
   value: .mefd/
   display_name: Multiscale Electrophysiology File Format Version 3.0
@@ -201,6 +231,12 @@ nwb:
     A [Neurodata Without Borders](https://nwb-schema.readthedocs.io/en/latest/) file.
 
     Each recording consists of a single `.nwb` file.
+ogg:
+  value: .ogg
+  display_name: Ogg Vorbis
+  description: |
+    An [Ogg Vorbis](https://en.wikipedia.org/wiki/Vorbis) audio file.
+    This format is commonly used for behavioral audio recordings.
 OMEBigTiff:
   value: .ome.btf
   display_name: Open Microscopy Environment BigTIFF
@@ -286,6 +322,12 @@ tsv_gz:
     A gzipped tab-delimited file.
     This file extension is only used for very large tabular data, such as physiological recordings.
     For smaller data, the unzipped `.tsv` extension is preferred.
+wav:
+  value: .wav
+  display_name: Waveform Audio File Format
+  description: |
+    A [Waveform Audio File Format](https://en.wikipedia.org/wiki/WAV) audio file.
+    This format is commonly used for behavioral audio recordings.
 txt:
   value: .txt
   display_name: Text
diff --git a/src/schema/objects/metadata.yaml b/src/schema/objects/metadata.yaml
index e6567a57f5..eba58d3809 100644
--- a/src/schema/objects/metadata.yaml
+++ b/src/schema/objects/metadata.yaml
@@ -807,6 +807,13 @@ Descriptors:
     - type: array
       items:
         type: string
+Device:
+  name: Device
+  display_name: Device
+  description: |
+    Free-form description of the device used to record the data
+    (for example, `"iPhone 12"`, `"Canon EOS R5"`).
+  type: string
 DeviceSerialNumber:
   name: DeviceSerialNumber
   display_name: Device Serial Number
@@ -4197,3 +4204,63 @@ iEEGReference:
     this field should have a general description and the channel specific
     reference should be defined in the `channels.tsv` file.
   type: string
+
+AudioChannelCount:
+  name: AudioChannelCount
+  display_name: Audio Channel Count
+  description: |
+    Number of audio channels in the recording (for example, `2` for stereo).
+  type: integer
+  minimum: 1
+
+AudioDuration:
+  name: AudioDuration
+  display_name: Audio Duration
+  description: |
+    Duration of the audio recording in seconds.
+  type: number
+  exclusiveMinimum: 0
+  unit: s
+
+AudioSampleRate:
+  name: AudioSampleRate
+  display_name: Audio Sample Rate
+  description: |
+    Sample rate of the audio recording in Hertz (for example, `44100`).
+  type: number
+  exclusiveMinimum: 0
+  unit: Hz
+
+Duration:
+  name: Duration
+  display_name: Duration
+  description: |
+    Total duration of the audio or video recording in seconds.
+  type: number
+  exclusiveMinimum: 0
+  unit: s
+
+FrameRate:
+  name: FrameRate
+  display_name: Frame Rate
+  description: |
+    Frame rate of the video recording in frames per second (for example, `30.0`).
+  type: number
+  exclusiveMinimum: 0
+  unit: Hz
+
+Height:
+  name: Height
+  display_name: Video Height
+  description: |
+    Height of the video in pixels (for example, `1080`).
+  type: integer
+  minimum: 1
+
+Width:
+  name: Width
+  display_name: Video Width
+  description: |
+    Width of the video in pixels (for example, `1920`).
+  type: integer
+  minimum: 1
diff --git a/src/schema/objects/suffixes.yaml b/src/schema/objects/suffixes.yaml
index 1e2825ced2..6e6efb66b6 100644
--- a/src/schema/objects/suffixes.yaml
+++ b/src/schema/objects/suffixes.yaml
@@ -528,6 +528,13 @@ asllabeling:
     A deidentified screenshot of the planning of the labeling slab/plane
     with respect to the imaging slab or slices.
     This screenshot is based on DICOM macro C.8.13.5.14.
+audio:
+  value: audio
+  display_name: Audio Recording
+  description: |
+    Audio recording of a behaving subject.
+    This may include vocalizations, speech, or environmental sounds.
+    The audio stream may be from a standalone audio recording or extracted from a video file.
 beh:
   value: beh
   display_name: Behavioral recording
@@ -877,3 +884,11 @@ unloc:
   description: |
     MRS acquisitions run without localization.
     This includes signals detected using coil sensitivity only.
+video:
+  value: video
+  display_name: Video Recording
+  description: |
+    Video recording of a behaving subject.
+    This may include both video and audio streams.
+    Video recordings are typically used to capture behavior, facial expressions,
+    or body movements during experimental tasks or rest.
diff --git a/src/schema/rules/files/raw/beh.yaml b/src/schema/rules/files/raw/beh.yaml
index df6f9dac06..cd9e681f4a 100644
--- a/src/schema/rules/files/raw/beh.yaml
+++ b/src/schema/rules/files/raw/beh.yaml
@@ -9,3 +9,50 @@ noncontinuous:
     - .json
   datatypes:
     - beh
+  entities:
+    subject: required
+    session: optional
+    task: required
+    acquisition: optional
+    run: optional
+
+# Audio recordings
+audio:
+  suffixes:
+    - audio
+  extensions:
+    - .flac
+    - .mp3
+    - .ogg
+    - .wav
+    - .json
+  datatypes:
+    - beh
+  entities:
+    subject: required
+    session: optional
+    task: optional
+    acquisition: optional
+    run: optional
+    recording: optional
+    split: optional
+
+# Video recordings
+video:
+  suffixes:
+    - video
+  extensions:
+    - .mp4
+    - .mkv
+    - .avi
+    - .json
+  datatypes:
+    - beh
+  entities:
+    subject: required
+    session: optional
+    task: optional
+    acquisition: optional
+    run: optional
+    recording: optional
+    split: optional
diff --git a/src/schema/rules/sidecars/beh.yaml b/src/schema/rules/sidecars/beh.yaml
index f2d8410914..27fc43e12f 100644
--- a/src/schema/rules/sidecars/beh.yaml
+++ b/src/schema/rules/sidecars/beh.yaml
@@ -25,3 +25,24 @@ BEHInstitutionInformation:
     InstitutionName: recommended
     InstitutionAddress: recommended
     InstitutionalDepartmentName: recommended
+
+# Audio and Video metadata
+AudioVideoDevice:
+  selectors:
+    - datatype == "beh"
+    - intersects([suffix], ["audio", "video"])
+  fields:
+    Device: optional
+    DeviceSerialNumber: optional
+
+AudioVideoStreams:
+  selectors:
+    - datatype == "beh"
+    - intersects([suffix], ["audio", "video"])
+  fields:
+    AudioChannelCount: optional
+    AudioSampleRate: optional
+    FrameRate: optional
+    Height: optional
+    Width: optional
+    Duration: optional