
Commit ad7ad4e

1. Fixed a problem with the to() implementation for modules whose parameters and/or buffers were declared on a base class.
2. Addressed an issue where ParameterDict and ParameterList did not handle _to() properly.
1 parent 59152f3 commit ad7ad4e
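
For context on item 1, a sketch of the scenario the fix targets (the BaseModule/Derived names below are hypothetical, not taken from this repository): a parameter declared on a base class must still be found and moved when to() is called on the derived module.

// Illustrative repro sketch only; per the commit message, before this commit
// 'weight' could be left behind by to() because it is declared on the base class.
using TorchSharp;
using TorchSharp.Modules;
using static TorchSharp.torch;

abstract class BaseModule : nn.Module<Tensor, Tensor>
{
    // The parameter lives on the *base* class -- the case the fix targets.
    protected Parameter weight = new Parameter(torch.randn(4, 4));
    protected BaseModule(string name) : base(name) { }
}

sealed class Derived : BaseModule
{
    public Derived() : base(nameof(Derived)) { RegisterComponents(); }
    public override Tensor forward(Tensor input) => input.matmul(weight);
}

// var m = new Derived();
// m.to(torch.float64);   // after this commit, 'weight' is converted as well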

File tree

12 files changed: +292 −187 lines

src/TorchSharp/NN/Convolution/Convolution.cs

Lines changed: 1 addition & 1 deletion
@@ -49,7 +49,7 @@ protected Convolution(string name, long in_channels, long out_channels, long[] k
     this.padding_mode = padding_mode;
 
     // Set this so the constructor doesn't give a non-null error, and the actual value is set in the
-    // SetPadding function called right after.
+    // SetPadding function called right after.
     this._reversed_padding_repeated_twice = Array.Empty<long>();
     if (padding_type.HasValue)
         SetPadding(padding_type.Value);

src/TorchSharp/NN/Module.cs

Lines changed: 112 additions & 70 deletions
Large diffs are not rendered by default.
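
The Module.cs hunk is collapsed here, but the ParameterDict.cs and ParameterList.cs diffs below override a new protected virtual _toEpilog(ScalarType?, Device, bool), which suggests the refactor routes the various _to() overloads through a single epilog that container modules can specialize. A minimal, self-contained sketch of that pattern, offered as an assumption about the shape of the change rather than the actual library code:

// Hypothetical sketch of the "single virtual epilog" pattern; names mirror
// TorchSharp's, but this is not the code in Module.cs.
using System;

abstract class ModuleSketch
{
    public ModuleSketch to(string device)
    {
        Console.WriteLine($"native move to {device}");  // stand-in for the real tensor move
        _toEpilog(device);                              // one managed extension point
        return this;
    }

    // Containers override this single method instead of every _to() overload,
    // so the overloads cannot fall out of sync.
    protected virtual void _toEpilog(string device) { }
}

sealed class ParameterListSketch : ModuleSketch
{
    protected override void _toEpilog(string device) =>
        Console.WriteLine($"rebuilding parameters on {device}");
}

static class EpilogDemo
{
    static void Main() => new ParameterListSketch().to("cuda:0");
}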

src/TorchSharp/NN/Normalization/NormBase.cs

Lines changed: 12 additions & 11 deletions
@@ -3,6 +3,7 @@
 using static TorchSharp.torch;
 using static TorchSharp.torch.nn;
 using static TorchSharp.PInvoke.NativeMethods;
+
 #nullable enable
 namespace TorchSharp
 {
@@ -14,13 +15,13 @@ namespace Modules
 {
     public abstract class NormBase : torch.nn.Module<Tensor, Tensor>
     {
-        public NormBase(long num_features,
-            double eps,
-            double? momentum,
-            bool affine,
-            bool track_running_stats,
-            Device? device,
-            ScalarType? dtype,
+        public NormBase(long num_features,
+            double eps,
+            double? momentum,
+            bool affine,
+            bool track_running_stats,
+            Device? device,
+            ScalarType? dtype,
             string name) : base(name)
         {
             this.num_features = num_features;
@@ -115,15 +116,15 @@ public Tensor? num_batches_tracked {
         ConditionallyRegisterBuffer(nameof(num_batches_tracked), _num_batches_tracked);
     }
 }
-
+
 public long num_features { get; private set; }
-
+
 public double eps { get; set; }
-
+
 public double? momentum { get; set; }
 
 public bool affine { get; private set; }
-
+
 public bool track_running_stats { get; private set; }
 
 [ComponentName(Name = nameof(bias))]

src/TorchSharp/NN/ParameterDict.cs

Lines changed: 41 additions & 26 deletions
@@ -14,7 +14,7 @@ namespace Modules
 {
     /// <summary>
     /// Holds parameters in a dictionary.
-    ///
+    ///
     /// ParameterDict can be indexed like a regular dictionary, but the parameters it
     /// contains are properly registered, and will be visible by all Module methods.
     ///
@@ -60,34 +60,43 @@ protected override void RegisterComponents()
 
     private bool _registered = false;
 
-    protected internal override Module _to(DeviceType deviceType, int deviceIndex, bool non_blocking)
-    {
-        base._to(deviceType, deviceIndex, non_blocking);
-        _toEpilog();
-        return this;
-    }
-
-    protected internal override Module _to(torch.Device device, torch.ScalarType dtype, bool non_blocking)
-    {
-        base._to(device, dtype, non_blocking);
-        _toEpilog();
-        return this;
-    }
-
-    protected internal override Module _to(torch.ScalarType dtype, bool non_blocking)
-    {
-        base._to(dtype, non_blocking);
-        _toEpilog();
-        return this;
-    }
-
-    void _toEpilog()
+    protected override void _toEpilog(torch.ScalarType? dtype, torch.Device device, bool non_blocking)
     {
         for (int i = 0; i < _list.Count; i++) {
             string name = _list[i].Item1;
-            var param = base.get_parameter(name);
-            _list[i] = (name, param);
-            _dict[name] = param;
+            var param = _list[i].Item2;
+
+            using var grad = param.grad;
+
+            if (!param.toWillCopy(dtype ?? param.dtype, device ?? param.device) &&
+                (grad is null || !grad.toWillCopy(dtype ?? param.dtype, device ?? param.device)))
+                continue;
+
+            Parameter p;
+            torch.ScalarType paramType =
+                dtype != null && (param.dtype.IsFloatingPoint() || param.dtype.IsComplex()) ? dtype.Value : param.dtype;
+
+            // When moving the parameter, we don't want the autograd to track this movement on the graph.
+            // In addition, we need the new tensor to be a leaf to accumulate gradients, so if we didn't
+            // disable grad we would need to call .detach() on the moved tensor.
+            using (var d = torch.no_grad()) {
+                p = new Parameter(
+                    data: param.to(paramType, device ?? param.device),
+                    requires_grad: param.requires_grad);
+                _ = p.DetachFromDisposeScope();
+
+                // Copy the gradient over as well, if it exists
+                if (grad is not null) {
+                    using var newGrad = grad.to(paramType, device ?? param.device)
+                        .with_requires_grad(grad.requires_grad);
+                    p.grad = newGrad;
+                }
+            }
+
+            param?.Dispose();
+
+            _list[i] = (name, p);
+            _dict[name] = p;
         }
     }
 
@@ -136,6 +145,12 @@ public override Parameter get_parameter(string target)
         return null;
     }
 
+    public override IEnumerable<(string name, Parameter parameter)> named_parameters(bool recurse)
+    {
+        // Ignore the 'recurse' parameter.
+        return _dict.Select(d => (d.Key, d.Value));
+    }
+
     public void Add((string, Parameter) item)
     {
         _dict.Add(item.Item1, item.Item2);
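
A short usage sketch of the new ParameterDict behavior, using only members visible in the hunk above (Add, the string indexer, and the grad setter): gradients are now converted along with their parameters.

// Usage sketch; assumes a TorchSharp build containing this commit.
using System;
using TorchSharp;
using TorchSharp.Modules;

class ParameterDictDemo
{
    static void Main()
    {
        var pd = new ParameterDict();
        pd.Add(("w", new Parameter(torch.randn(2, 3))));

        pd["w"].grad = torch.zeros(2, 3);   // give the parameter a gradient to carry along

        pd.to(torch.float64);               // dtype move now goes through _toEpilog

        Console.WriteLine(pd["w"].dtype);        // Float64
        Console.WriteLine(pd["w"].grad?.dtype);  // Float64 -- the gradient moved too
    }
}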

src/TorchSharp/NN/ParameterList.cs

Lines changed: 40 additions & 29 deletions
@@ -33,35 +33,6 @@ protected override void RegisterComponents()
     _registered = true;
 }
 
-
-protected internal override Module _to(DeviceType deviceType, int deviceIndex, bool non_blocking)
-{
-    base._to(deviceType, deviceIndex, non_blocking);
-    _toEpilog();
-    return this;
-}
-
-protected internal override Module _to(torch.Device device, torch.ScalarType dtype, bool non_blocking)
-{
-    base._to(device, dtype, non_blocking);
-    _toEpilog();
-    return this;
-}
-
-protected internal override Module _to(torch.ScalarType dtype, bool non_blocking)
-{
-    base._to(dtype, non_blocking);
-    _toEpilog();
-    return this;
-}
-
-void _toEpilog()
-{
-    for (int i = 0; i < _list.Count; i++) {
-        _list[i] = base.get_parameter($"{i}");
-    }
-}
-
 public override IEnumerable<(string name, Parameter parameter)> named_parameters(bool recurse = true)
 {
     return Enumerable.Range(0, _list.Count).Select(i => ($"{i}", _list[i]));
@@ -80,6 +51,46 @@ public override Parameter get_parameter(string target)
     return null;
 }
 
+protected override void _toEpilog(torch.ScalarType? dtype, torch.Device device, bool non_blocking)
+{
+    for (int i = 0; i < _list.Count; i++) {
+
+        string name = $"{i}";
+        var param = _list[i];
+
+        using var grad = param.grad;
+
+        if (!param.toWillCopy(dtype ?? param.dtype, device ?? param.device) &&
+            (grad is null || !grad.toWillCopy(dtype ?? param.dtype, device ?? param.device)))
+            continue;
+
+        Parameter p;
+        torch.ScalarType paramType =
+            dtype != null && (param.dtype.IsFloatingPoint() || param.dtype.IsComplex()) ? dtype.Value : param.dtype;
+
+        // When moving the parameter, we don't want the autograd to track this movement on the graph.
+        // In addition, we need the new tensor to be a leaf to accumulate gradients, so if we didn't
+        // disable grad we would need to call .detach() on the moved tensor.
+        using (var d = torch.no_grad()) {
+            p = new Parameter(
+                data: param.to(paramType, device ?? param.device),
+                requires_grad: param.requires_grad);
+            _ = p.DetachFromDisposeScope();
+
+            // Copy the gradient over as well, if it exists
+            if (grad is not null) {
+                using var newGrad = grad.to(paramType, device ?? param.device)
+                    .with_requires_grad(grad.requires_grad);
+                p.grad = newGrad;
+            }
+        }
+
+        param?.Dispose();
+
+        _list[i] = p;
+    }
+}
+
 private bool _registered = false;
 
 public Parameter this[int index] {
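
The same path exercised through ParameterList (a sketch under the same assumptions as the ParameterDict example above). Because the rebuild happens inside torch.no_grad(), the replacement parameter stays a leaf and keeps its requires_grad flag.

// Usage sketch mirroring the ParameterDict example.
using System;
using TorchSharp;
using TorchSharp.Modules;

class ParameterListDemo
{
    static void Main()
    {
        var pl = new ParameterList();
        pl.Add(new Parameter(torch.randn(8)));

        pl.to(torch.float64);                    // rebuilds pl[0] via _toEpilog

        Console.WriteLine(pl[0].dtype);          // Float64
        Console.WriteLine(pl[0].requires_grad);  // True -- preserved by the rebuild
        Console.WriteLine(pl[0].is_leaf);        // True -- thanks to the no_grad block
    }
}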

src/TorchVision/models/AlexNet.cs

Lines changed: 2 additions & 2 deletions
@@ -24,7 +24,7 @@ public static partial class models
 ///
 /// from torchvision import models
 /// import exportsd
-///
+///
 /// model = models.alexnet(pretrained=True)
 /// f = open("model_weights.dat", "wb")
 /// exportsd.save_state_dict(model.state_dict(), f)
@@ -105,7 +105,7 @@ public AlexNet(int numClasses, float dropout = 0.5f, string? weights_file = null
 
     if (!string.IsNullOrEmpty(weights_file)) {
 
-        this.load(weights_file, skip: skipfc ? new[] { "classifier.6.weight", "classifier.6.bias" } : null);
+        this.load(weights_file!, skip: skipfc ? new[] { "classifier.6.weight", "classifier.6.bias" } : null);
     }
 
     if (device != null && device.type != DeviceType.CPU)
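
The substantive change here (repeated in GoogleNet.cs, InceptionV3.cs, and ResNet.cs below) is the null-forgiving ! on weights_file. A minimal illustration of why it is needed, assuming a target framework such as netstandard2.0 where string.IsNullOrEmpty lacks the [NotNullWhen(false)] annotation and therefore cannot narrow weights_file to non-null:

#nullable enable
using System;

class NullForgivingDemo
{
    static void Load(string path) => Console.WriteLine($"loading {path}");

    static void Main()
    {
        string? weights_file = Environment.GetEnvironmentVariable("WEIGHTS");
        if (!string.IsNullOrEmpty(weights_file)) {
            // Without [NotNullWhen(false)] on IsNullOrEmpty, the compiler still
            // treats 'weights_file' as maybe-null here; '!' asserts otherwise.
            Load(weights_file!);
        }
    }
}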

src/TorchVision/models/GoogleNet.cs

Lines changed: 2 additions & 2 deletions
@@ -26,7 +26,7 @@ public static partial class models
 ///
 /// from torchvision import models
 /// import exportsd
-///
+///
 /// model = models.inception_v3(pretrained=True)
 /// f = open("model_weights.dat", "wb")
 /// exportsd.save_state_dict(model.state_dict(), f)
@@ -170,7 +170,7 @@ public GoogleNet(int numClasses = 1000,
             break;
         }
     }
-    this.load(weights_file, skip: skipfc ? new[] { "fc.weight", "fc.bias", "AuxLogits.fc.weight", "AuxLogits.fc.bias" } : null);
+    this.load(weights_file!, skip: skipfc ? new[] { "fc.weight", "fc.bias", "AuxLogits.fc.weight", "AuxLogits.fc.bias" } : null);
 }
 
 if (device != null && device.type != DeviceType.CPU)

src/TorchVision/models/InceptionV3.cs

Lines changed: 2 additions & 2 deletions
@@ -25,7 +25,7 @@ public static partial class models
 ///
 /// from torchvision import models
 /// import exportsd
-///
+///
 /// model = models.inception_v3(pretrained=True)
 /// f = open("model_weights.dat", "wb")
 /// exportsd.save_state_dict(model.state_dict(), f)
@@ -170,7 +170,7 @@ public InceptionV3(int numClasses = 1000,
             break;
         }
     }
-    this.load(weights_file, skip: skipfc ? new[] { "fc.weight", "fc.bias", "AuxLogits.fc.weight", "AuxLogits.fc.bias" } : null);
+    this.load(weights_file!, skip: skipfc ? new[] { "fc.weight", "fc.bias", "AuxLogits.fc.weight", "AuxLogits.fc.bias" } : null);
 }
 
 if (device != null && device.type != DeviceType.CPU)

src/TorchVision/models/ResNet.cs

Lines changed: 11 additions & 11 deletions
@@ -30,7 +30,7 @@ public static partial class models
 ///
 /// from torchvision import models
 /// import exportsd
-///
+///
 /// model = models.resnet18(pretrained=True)
 /// f = open("model_weights.dat", "wb")
 /// exportsd.save_state_dict(model.state_dict(), f)
@@ -86,7 +86,7 @@ public static Modules.ResNet resnet18(
 ///
 /// from torchvision import models
 /// import exportsd
-///
+///
 /// model = models.resnet34(pretrained=True)
 /// f = open("model_weights.dat", "wb")
 /// exportsd.save_state_dict(model.state_dict(), f)
@@ -142,7 +142,7 @@ public static Modules.ResNet resnet34(
 ///
 /// from torchvision import models
 /// import exportsd
-///
+///
 /// model = models.resnet50(pretrained=True)
 /// f = open("model_weights.dat", "wb")
 /// exportsd.save_state_dict(model.state_dict(), f)
@@ -197,7 +197,7 @@ public static Modules.ResNet resnet50(
 ///
 /// from torchvision import models
 /// import exportsd
-///
+///
 /// model = models.wide_resnet50_2(pretrained=True)
 /// f = open("model_weights.dat", "wb")
 /// exportsd.save_state_dict(model.state_dict(), f)
@@ -251,7 +251,7 @@ public static Modules.ResNet wide_resnet50_2(
 ///
 /// from torchvision import models
 /// import exportsd
-///
+///
 /// model = models.resnext50_32x4d(pretrained=True)
 /// f = open("model_weights.dat", "wb")
 /// exportsd.save_state_dict(model.state_dict(), f)
@@ -305,7 +305,7 @@ public static Modules.ResNet resnext50_32x4d(
 ///
 /// from torchvision import models
 /// import exportsd
-///
+///
 /// model = models.resnet101(pretrained=True)
 /// f = open("model_weights.dat", "wb")
 /// exportsd.save_state_dict(model.state_dict(), f)
@@ -360,7 +360,7 @@ public static Modules.ResNet resnet101(
 ///
 /// from torchvision import models
 /// import exportsd
-///
+///
 /// model = models.resnext101_32x8d(pretrained=True)
 /// f = open("model_weights.dat", "wb")
 /// exportsd.save_state_dict(model.state_dict(), f)
@@ -413,7 +413,7 @@ public static Modules.ResNet resnext101_32x8d(
 ///
 /// from torchvision import models
 /// import exportsd
-///
+///
 /// model = models.resnext101_32x8d(pretrained=True)
 /// f = open("model_weights.dat", "wb")
 /// exportsd.save_state_dict(model.state_dict(), f)
@@ -466,7 +466,7 @@ public static Modules.ResNet resnext101_64x4d(
 ///
 /// from torchvision import models
 /// import exportsd
-///
+///
 /// model = models.resnet101(pretrained=True)
 /// f = open("model_weights.dat", "wb")
 /// exportsd.save_state_dict(model.state_dict(), f)
@@ -521,7 +521,7 @@ public static Modules.ResNet wide_resnet101_2(
 ///
 /// from torchvision import models
 /// import exportsd
-///
+///
 /// model = models.resnet152(pretrained=True)
 /// f = open("model_weights.dat", "wb")
 /// exportsd.save_state_dict(model.state_dict(), f)
@@ -825,7 +825,7 @@ public ResNet(string name,
 
 } else {
 
-    this.load(weights_file, skip: skipfc ? new[] { "fc.weight", "fc.bias" } : null);
+    this.load(weights_file!, skip: skipfc ? new[] { "fc.weight", "fc.bias" } : null);
 }
 
 if (device != null && device.type != DeviceType.CPU)
