Skip to content

Commit 46a6edb

Browse files
MaxGhenis and claude committed
Clean up analysis.py - remove old revenue methodology
Remove all references to:
- Savills stock estimates (11,481)
- RoS sales data (391)
- Council-level sales allocations
- Revenue calculations (£18.5m)
- Band I/J surcharge rates

Now uses simple Band H property counts from NRS directly.

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 837d1de commit 46a6edb

File tree

1 file changed

+71
-274
lines changed
  • scotland-mansion-tax/src/scotland_mansion_tax

1 file changed

+71
-274
lines changed
Lines changed: 71 additions & 274 deletions
Original file line numberDiff line numberDiff line change
@@ -1,81 +1,20 @@
11
"""
2-
Core analysis module for Scottish Mansion Tax calculations.
2+
Core analysis module for Scottish Mansion Tax - Band H property distribution.
33
4-
Revenue Calculation:
5-
Revenue = Stock × Average Rate
6-
= 11,481 × £1,607
7-
= £18.5m
4+
We use Council Tax Band H as a proxy for £1m+ properties:
5+
- Band H threshold: >£212k at 1991 values ≈ £1m today
6+
- Scotland has 16,011 Band H properties across 2.83M dwellings (0.57%)
87
9-
Where:
10-
- Stock (11,481): Total £1m+ properties in Scotland (Savills, 2022)
11-
- Average Rate (£1,607): (89% × £1,500) + (11% × £2,500)
12-
- Band split from Savills 2024: 416 sales £1m-£2m, 50 sales £2m+
13-
14-
Sales data (391 from RoS) is only used for GEOGRAPHIC DISTRIBUTION,
15-
not for calculating total revenue.
8+
Data source: National Records of Scotland Small Area Statistics 2024
169
"""
1710

1811
from pathlib import Path
19-
from typing import Dict, Optional
12+
from typing import Optional
2013

2114
import pandas as pd
2215

23-
from scotland_mansion_tax.data import load_population_data, load_wealth_factors
24-
25-
# Surcharge rates (benchmark - Scotland rates not yet announced)
26-
# Source: https://www.gov.uk/government/publications/high-value-council-tax-surcharge
27-
BAND_I_SURCHARGE = 1_500 # £1,500/year for £1m-£2m properties
28-
BAND_J_SURCHARGE = 2_500 # £2,500/year for £2m+ properties
29-
30-
# Stock estimate from Savills (February 2023)
31-
# Source: https://www.savills.com/insight-and-opinion/savills-news/339380/
32-
ESTIMATED_STOCK = 11_481 # Exact figure from Savills research
33-
34-
# Band distribution (from Savills 2024 data)
35-
# Source: https://www.savills.co.uk/research_articles/229130/372275-0
36-
# 2024: 416 sales £1m-£2m, 50 sales £2m+ (total 466)
37-
BAND_I_RATIO = 416 / 466 # £1m-£2m = 89.3%
38-
BAND_J_RATIO = 50 / 466 # £2m+ = 10.7%
16+
from scotland_mansion_tax.data import load_wealth_factors, get_data_dir
3917

40-
# Council-level £1m+ sales estimates
41-
# Primary source: Registers of Scotland Property Market Report 2024-25
42-
# https://www.ros.gov.uk/data-and-statistics/property-market-statistics/property-market-report-2024-25
43-
ROS_REPORTED_TOTAL = 391 # Official RoS figure for validation reference
44-
45-
COUNCIL_DATA = {
46-
"City of Edinburgh": 200, # >50% per RoS; EH3 (53) + EH4 (49) + EH9/10/12 (~98)
47-
"East Lothian": 35, # North Berwick area (EH39: 18 + surrounding)
48-
"Fife": 30, # St Andrews (KY16: 22 + surrounding)
49-
"East Dunbartonshire": 25, # Bearsden (G61: 15 + surrounding)
50-
"Aberdeen City": 20, # AB15 and central Aberdeen
51-
"Aberdeenshire": 15, # Rural Aberdeenshire
52-
"Glasgow City": 15, # G12, G41 areas
53-
"Perth and Kinross": 12, # Perth, Auchterarder
54-
"Stirling": 10, # Bridge of Allan, Dunblane
55-
"Highland": 10, # Inverness, rural Highlands
56-
"East Renfrewshire": 10, # Newton Mearns (G77)
57-
"Scottish Borders": 8, # Melrose, Kelso
58-
"South Ayrshire": 7, # Ayr coastal
59-
"Argyll and Bute": 6, # Helensburgh, Oban
60-
"Midlothian": 5, # Dalkeith area
61-
"West Lothian": 5, # Linlithgow
62-
"South Lanarkshire": 3,
63-
"North Lanarkshire": 2,
64-
"Renfrewshire": 2,
65-
"Inverclyde": 1,
66-
"Falkirk": 1,
67-
"Clackmannanshire": 1,
68-
"Dumfries and Galloway": 1,
69-
"Dundee City": 1,
70-
"Angus": 1,
71-
"Moray": 1,
72-
"North Ayrshire": 1,
73-
"West Dunbartonshire": 1,
74-
"East Ayrshire": 0,
75-
"Eilean Siar": 0,
76-
"Orkney Islands": 0,
77-
"Shetland Islands": 0,
78-
}
7918

8019
# Constituency to council mapping (Scottish Parliament 2021 boundaries)
8120
CONSTITUENCY_COUNCIL_MAPPING = {
@@ -184,227 +123,85 @@
184123
"Shetland Islands": "Shetland Islands",
185124
}
186125

187-
# Expected number of constituencies
188-
EXPECTED_CONSTITUENCIES = 73
189-
190126

191-
def calculate_wealth_adjusted_weights(
192-
population_df: pd.DataFrame, wealth_factors: Dict[str, float]
193-
) -> Dict[str, dict]:
194-
"""Calculate wealth-adjusted weights within each council.
195-
196-
Weight = (Population × Wealth Factor) / Sum(Population × Wealth Factor for council)
197-
198-
Args:
199-
population_df: DataFrame with constituency populations.
200-
wealth_factors: Dict mapping constituency -> wealth factor.
201-
202-
Returns:
203-
Dict mapping constituency -> {council, population, wealth_factor, weight}.
204-
"""
205-
weights = {}
206-
207-
# Group constituencies by council with adjusted values
208-
council_data = {}
209-
for constituency, council in CONSTITUENCY_COUNCIL_MAPPING.items():
210-
if council not in council_data:
211-
council_data[council] = []
212-
213-
# Find population for this constituency
214-
pop_row = population_df[population_df["constituency"] == constituency]
215-
if len(pop_row) == 0:
216-
raise ValueError(f"No population data for {constituency}")
217-
pop = pop_row["population"].values[0]
218-
219-
# Get wealth adjustment factor
220-
if constituency not in wealth_factors:
221-
raise ValueError(f"No wealth factor for {constituency}")
222-
wealth_factor = wealth_factors[constituency]
223-
224-
# Adjusted value = population × wealth factor
225-
adjusted_value = pop * wealth_factor
226-
227-
council_data[council].append((constituency, pop, wealth_factor, adjusted_value))
228-
229-
# Calculate weights within each council using adjusted values
230-
for council, constituencies in council_data.items():
231-
total_adjusted = sum(adj for _, _, _, adj in constituencies)
232-
for constituency, pop, wealth_factor, adjusted_value in constituencies:
233-
# Weight based on adjusted value, not raw population
234-
weight = (
235-
adjusted_value / total_adjusted
236-
if total_adjusted > 0
237-
else 1 / len(constituencies)
238-
)
239-
weights[constituency] = {
240-
"council": council,
241-
"population": pop,
242-
"wealth_factor": wealth_factor,
243-
"weight": weight,
244-
}
245-
246-
return weights
247-
248-
249-
def analyze_constituencies(
250-
data_dir: Optional[Path] = None, verbose: bool = True
127+
def generate_band_h_csv(
128+
data_dir: Optional[Path] = None,
129+
output_path: Optional[Path] = None,
130+
verbose: bool = True
251131
) -> pd.DataFrame:
252-
"""Distribute council-level estimates to constituencies using wealth-adjusted weights.
132+
"""Generate CSV with Band H properties by constituency.
253133
254134
Args:
255-
data_dir: Directory containing data files. Defaults to package data dir.
135+
data_dir: Directory containing NRS data files.
136+
output_path: Where to save the CSV. If None, returns DataFrame only.
256137
verbose: Print progress messages.
257138
258139
Returns:
259-
DataFrame with constituency-level analysis results.
140+
DataFrame with constituency, council, band_h_properties,
141+
total_dwellings, pct_band_h.
260142
"""
261-
if verbose:
262-
print("=" * 70)
263-
print("Scottish Mansion Tax Analysis by Parliament Constituency")
264-
print("Using wealth-adjusted weights (population × Band H factor)")
265-
print("=" * 70)
266-
267-
# Load population data
268-
if verbose:
269-
print("\n📊 Loading NRS population data...")
270-
population_df = load_population_data(data_dir, verbose)
271-
if verbose:
272-
print(f" ✓ Loaded {len(population_df)} constituencies")
273-
274-
# Load wealth factors from Council Tax Band H data
275-
if verbose:
276-
print("\n💎 Loading Council Tax Band H data (wealth proxy)...")
277-
wealth_factors = load_wealth_factors(data_dir, verbose)
278-
if verbose:
279-
print(f" ✓ Loaded wealth factors for {len(wealth_factors)} constituencies")
143+
if data_dir is None:
144+
data_dir = get_data_dir()
280145

281-
# Calculate wealth-adjusted weights
282146
if verbose:
283-
print("\n📈 Calculating wealth-adjusted weights...")
284-
weights = calculate_wealth_adjusted_weights(population_df, wealth_factors)
285-
286-
# Calculate total sales for normalization
287-
total_sales = sum(COUNCIL_DATA.values())
288-
289-
results = []
147+
print("Loading Band H data from NRS...")
290148

291-
for constituency, data in weights.items():
292-
council = data["council"]
293-
weight = data["weight"]
294-
population = data["population"]
295-
wealth_factor = data["wealth_factor"]
149+
# Load dwelling estimates with Band H
150+
dwelling_file = data_dir / "dwelling_estimates_by_dz.xlsx"
151+
df = pd.read_excel(dwelling_file, sheet_name="2023", header=4)
152+
df.columns = df.columns.str.replace("\n", " ").str.strip()
296153

297-
# Get council's total sales
298-
if council not in COUNCIL_DATA:
299-
raise ValueError(f"Council {council} not in COUNCIL_DATA")
300-
council_sales = COUNCIL_DATA[council]
154+
dz_data = df[["Data Zone code", "Total number of dwellings", "Council Tax band: H"]].copy()
155+
dz_data.columns = ["DataZone", "TotalDwellings", "BandH"]
156+
dz_data = dz_data.dropna(subset=["DataZone"])
301157

302-
# Allocate to constituency based on wealth-adjusted weight
303-
constituency_sales = council_sales * weight
158+
# Load DZ to Constituency lookup
159+
lookup = pd.read_csv(data_dir / "dz_to_constituency_lookup.csv")
304160

305-
# Calculate share of total
306-
share = constituency_sales / total_sales if total_sales > 0 else 0
161+
# Merge and aggregate
162+
merged = dz_data.merge(lookup, on="DataZone", how="left")
163+
constituency_data = merged.groupby("ConstituencyCode").agg({
164+
"TotalDwellings": "sum",
165+
"BandH": "sum"
166+
}).reset_index()
307167

308-
# Band breakdown
309-
band_i_sales = constituency_sales * BAND_I_RATIO
310-
band_j_sales = constituency_sales * BAND_J_RATIO
168+
# Load constituency names
169+
names = pd.read_csv(data_dir / "constituency_names.csv")
170+
name_lookup = dict(zip(names["Code"], names["Name"]))
311171

312-
# Calculate implied revenue from sales using UK rates
313-
implied_from_sales = (band_i_sales * BAND_I_SURCHARGE) + (
314-
band_j_sales * BAND_J_SURCHARGE
315-
)
316-
317-
rounded_sales = round(constituency_sales)
318-
results.append(
319-
{
320-
"constituency": constituency,
321-
"council": council,
322-
"population": population,
323-
"wealth_factor": wealth_factor,
324-
"weight": round(weight, 4),
325-
"estimated_sales": rounded_sales,
326-
"band_i_sales": round(band_i_sales),
327-
"band_j_sales": round(band_j_sales),
328-
"share_pct": round(share * 100, 2) if rounded_sales > 0 else 0,
329-
"implied_from_sales": round(implied_from_sales)
330-
if rounded_sales > 0
331-
else 0,
332-
}
333-
)
334-
335-
df = pd.DataFrame(results)
336-
df = df.sort_values("estimated_sales", ascending=False)
337-
338-
# Calculate total revenue using simple formula: Stock × Average Rate
339-
avg_rate = BAND_I_RATIO * BAND_I_SURCHARGE + BAND_J_RATIO * BAND_J_SURCHARGE
340-
total_stock_revenue = ESTIMATED_STOCK * avg_rate # 11,481 × £1,607 = £18.5m
341-
342-
# Allocate total revenue proportionally by each constituency's share
343-
df["allocated_revenue"] = (df["share_pct"] / 100 * total_stock_revenue).round(0)
172+
# Build output
173+
results = []
174+
for _, row in constituency_data.iterrows():
175+
name = name_lookup.get(row["ConstituencyCode"], row["ConstituencyCode"])
176+
total = int(row["TotalDwellings"])
177+
band_h = int(row["BandH"])
178+
pct = (band_h / total * 100) if total > 0 else 0
179+
council = CONSTITUENCY_COUNCIL_MAPPING.get(name, "Unknown")
180+
results.append({
181+
"constituency": name,
182+
"council": council,
183+
"band_h_properties": band_h,
184+
"total_dwellings": total,
185+
"pct_band_h": round(pct, 4)
186+
})
187+
188+
out_df = pd.DataFrame(results)
189+
out_df = out_df.sort_values("pct_band_h", ascending=False)
344190

345191
if verbose:
346-
# Print summary
347-
print(f"\n📊 Total constituencies: {len(df)}")
348-
print(
349-
f"📈 Total £1m+ sales: {df['estimated_sales'].sum():.0f} (for geographic distribution)"
350-
)
351-
print(f"🏠 Estimated £1m+ stock: {ESTIMATED_STOCK:,} (Savills)")
352-
print(f"\n💰 Revenue calculation:")
353-
print(f" Band I rate: £{BAND_I_SURCHARGE:,}/year ({BAND_I_RATIO:.1%} of properties)")
354-
print(f" Band J rate: £{BAND_J_SURCHARGE:,}/year ({BAND_J_RATIO:.1%} of properties)")
355-
print(f" Average rate: £{avg_rate:,.0f}/year")
356-
print(
357-
f" Formula: Stock × Avg Rate = {ESTIMATED_STOCK:,} × £{avg_rate:,.0f} = £{total_stock_revenue/1e6:.1f}m"
358-
)
359-
360-
print("\n🏛️ Top 10 Constituencies by Impact:")
361-
print("-" * 90)
362-
print(f"{'Constituency':<40} {'Council':<20} {'Sales':>6} {'Revenue':>12}")
363-
print("-" * 90)
364-
365-
for _, row in df.head(10).iterrows():
366-
council_short = (
367-
row["council"][:19] if len(row["council"]) > 19 else row["council"]
368-
)
369-
print(
370-
f"{row['constituency']:<40} {council_short:<20} "
371-
f"{row['estimated_sales']:>6} £{row['allocated_revenue']/1e6:>10.2f}m"
372-
)
373-
374-
# Edinburgh subtotal
375-
edinburgh_df = df[df["council"] == "City of Edinburgh"]
376-
print(f"\n📍 Edinburgh Total (6 constituencies):")
377-
print(
378-
f" {edinburgh_df['estimated_sales'].sum():.0f} sales, "
379-
f"£{edinburgh_df['allocated_revenue'].sum()/1e6:.1f}m "
380-
f"({edinburgh_df['share_pct'].sum():.1f}%)"
381-
)
382-
383-
return df
384-
385-
386-
def get_summary_stats(df: pd.DataFrame) -> dict:
387-
"""Get summary statistics from analysis results.
388-
389-
Args:
390-
df: DataFrame from analyze_constituencies()
391-
392-
Returns:
393-
Dictionary with summary statistics.
394-
"""
395-
avg_rate = BAND_I_RATIO * BAND_I_SURCHARGE + BAND_J_RATIO * BAND_J_SURCHARGE
396-
397-
edinburgh_df = df[df["council"] == "City of Edinburgh"]
398-
399-
return {
400-
"total_constituencies": len(df),
401-
"constituencies_with_sales": len(df[df["estimated_sales"] > 0]),
402-
"total_sales": df["estimated_sales"].sum(),
403-
"estimated_stock": ESTIMATED_STOCK,
404-
"total_revenue": df["allocated_revenue"].sum(),
405-
"average_rate": avg_rate,
406-
"edinburgh_revenue": edinburgh_df["allocated_revenue"].sum(),
407-
"edinburgh_share_pct": edinburgh_df["share_pct"].sum(),
408-
"top_constituency": df.iloc[0]["constituency"],
409-
"top_constituency_revenue": df.iloc[0]["allocated_revenue"],
410-
}
192+
total_band_h = out_df['band_h_properties'].sum()
193+
total_dwellings = out_df['total_dwellings'].sum()
194+
print(f"Scotland total: {total_band_h:,} Band H properties")
195+
print(f"Scotland total: {total_dwellings:,} dwellings")
196+
print(f"Scotland average: {total_band_h / total_dwellings * 100:.2f}%")
197+
print()
198+
print("Top 5 by % Band H:")
199+
for _, row in out_df.head(5).iterrows():
200+
print(f" {row['constituency']}: {row['pct_band_h']:.2f}%")
201+
202+
if output_path:
203+
out_df.to_csv(output_path, index=False)
204+
if verbose:
205+
print(f"\nSaved to {output_path}")
206+
207+
return out_df

0 commit comments

Comments
 (0)