|
1 | 1 | """ |
2 | | -Core analysis module for Scottish Mansion Tax calculations. |
| 2 | +Core analysis module for Scottish Mansion Tax - Band H property distribution. |
3 | 3 |
|
4 | | -Revenue Calculation: |
5 | | - Revenue = Stock × Average Rate |
6 | | - = 11,481 × £1,607 |
7 | | - = £18.5m |
| 4 | +We use Council Tax Band H as a proxy for £1m+ properties: |
| 5 | +- Band H threshold: >£212k in 1991 ≈ £1m today |
| 6 | +- Scotland has 16,011 Band H properties across 2.83M dwellings (0.57%) |
8 | 7 |
|
9 | | - Where: |
10 | | - - Stock (11,481): Total £1m+ properties in Scotland (Savills, 2022) |
11 | | - - Average Rate (£1,607): (89% × £1,500) + (11% × £2,500) |
12 | | - - Band split from Savills 2024: 416 sales £1m-£2m, 50 sales £2m+ |
13 | | -
|
14 | | - Sales data (391 from RoS) is only used for GEOGRAPHIC DISTRIBUTION, |
15 | | - not for calculating total revenue. |
| 8 | +Data source: National Records of Scotland Small Area Statistics 2024 |
16 | 9 | """ |
17 | 10 |
|
18 | 11 | from pathlib import Path |
19 | | -from typing import Dict, Optional |
| 12 | +from typing import Optional |
20 | 13 |
|
21 | 14 | import pandas as pd |
22 | 15 |
|
23 | | -from scotland_mansion_tax.data import load_population_data, load_wealth_factors |
24 | | - |
25 | | -# Surcharge rates (benchmark - Scotland rates not yet announced) |
26 | | -# Source: https://www.gov.uk/government/publications/high-value-council-tax-surcharge |
27 | | -BAND_I_SURCHARGE = 1_500 # £1,500/year for £1m-£2m properties |
28 | | -BAND_J_SURCHARGE = 2_500 # £2,500/year for £2m+ properties |
29 | | - |
30 | | -# Stock estimate from Savills (February 2023) |
31 | | -# Source: https://www.savills.com/insight-and-opinion/savills-news/339380/ |
32 | | -ESTIMATED_STOCK = 11_481 # Exact figure from Savills research |
33 | | - |
34 | | -# Band distribution (from Savills 2024 data) |
35 | | -# Source: https://www.savills.co.uk/research_articles/229130/372275-0 |
36 | | -# 2024: 416 sales £1m-£2m, 50 sales £2m+ (total 466) |
37 | | -BAND_I_RATIO = 416 / 466 # £1m-£2m = 89.3% |
38 | | -BAND_J_RATIO = 50 / 466 # £2m+ = 10.7% |
| 16 | +from scotland_mansion_tax.data import load_wealth_factors, get_data_dir |
39 | 17 |
|
40 | | -# Council-level £1m+ sales estimates |
41 | | -# Primary source: Registers of Scotland Property Market Report 2024-25 |
42 | | -# https://www.ros.gov.uk/data-and-statistics/property-market-statistics/property-market-report-2024-25 |
43 | | -ROS_REPORTED_TOTAL = 391 # Official RoS figure for validation reference |
44 | | - |
45 | | -COUNCIL_DATA = { |
46 | | - "City of Edinburgh": 200, # >50% per RoS; EH3 (53) + EH4 (49) + EH9/10/12 (~98) |
47 | | - "East Lothian": 35, # North Berwick area (EH39: 18 + surrounding) |
48 | | - "Fife": 30, # St Andrews (KY16: 22 + surrounding) |
49 | | - "East Dunbartonshire": 25, # Bearsden (G61: 15 + surrounding) |
50 | | - "Aberdeen City": 20, # AB15 and central Aberdeen |
51 | | - "Aberdeenshire": 15, # Rural Aberdeenshire |
52 | | - "Glasgow City": 15, # G12, G41 areas |
53 | | - "Perth and Kinross": 12, # Perth, Auchterarder |
54 | | - "Stirling": 10, # Bridge of Allan, Dunblane |
55 | | - "Highland": 10, # Inverness, rural Highlands |
56 | | - "East Renfrewshire": 10, # Newton Mearns (G77) |
57 | | - "Scottish Borders": 8, # Melrose, Kelso |
58 | | - "South Ayrshire": 7, # Ayr coastal |
59 | | - "Argyll and Bute": 6, # Helensburgh, Oban |
60 | | - "Midlothian": 5, # Dalkeith area |
61 | | - "West Lothian": 5, # Linlithgow |
62 | | - "South Lanarkshire": 3, |
63 | | - "North Lanarkshire": 2, |
64 | | - "Renfrewshire": 2, |
65 | | - "Inverclyde": 1, |
66 | | - "Falkirk": 1, |
67 | | - "Clackmannanshire": 1, |
68 | | - "Dumfries and Galloway": 1, |
69 | | - "Dundee City": 1, |
70 | | - "Angus": 1, |
71 | | - "Moray": 1, |
72 | | - "North Ayrshire": 1, |
73 | | - "West Dunbartonshire": 1, |
74 | | - "East Ayrshire": 0, |
75 | | - "Eilean Siar": 0, |
76 | | - "Orkney Islands": 0, |
77 | | - "Shetland Islands": 0, |
78 | | -} |
79 | 18 |
|
80 | 19 | # Constituency to council mapping (Scottish Parliament 2021 boundaries) |
81 | 20 | CONSTITUENCY_COUNCIL_MAPPING = { |
|
184 | 123 | "Shetland Islands": "Shetland Islands", |
185 | 124 | } |
186 | 125 |
|
187 | | -# Expected number of constituencies |
188 | | -EXPECTED_CONSTITUENCIES = 73 |
189 | | - |
190 | 126 |
|
191 | | -def calculate_wealth_adjusted_weights( |
192 | | - population_df: pd.DataFrame, wealth_factors: Dict[str, float] |
193 | | -) -> Dict[str, dict]: |
194 | | - """Calculate wealth-adjusted weights within each council. |
195 | | -
|
196 | | - Weight = (Population × Wealth Factor) / Sum(Population × Wealth Factor for council) |
197 | | -
|
198 | | - Args: |
199 | | - population_df: DataFrame with constituency populations. |
200 | | - wealth_factors: Dict mapping constituency -> wealth factor. |
201 | | -
|
202 | | - Returns: |
203 | | - Dict mapping constituency -> {council, population, wealth_factor, weight}. |
204 | | - """ |
205 | | - weights = {} |
206 | | - |
207 | | - # Group constituencies by council with adjusted values |
208 | | - council_data = {} |
209 | | - for constituency, council in CONSTITUENCY_COUNCIL_MAPPING.items(): |
210 | | - if council not in council_data: |
211 | | - council_data[council] = [] |
212 | | - |
213 | | - # Find population for this constituency |
214 | | - pop_row = population_df[population_df["constituency"] == constituency] |
215 | | - if len(pop_row) == 0: |
216 | | - raise ValueError(f"No population data for {constituency}") |
217 | | - pop = pop_row["population"].values[0] |
218 | | - |
219 | | - # Get wealth adjustment factor |
220 | | - if constituency not in wealth_factors: |
221 | | - raise ValueError(f"No wealth factor for {constituency}") |
222 | | - wealth_factor = wealth_factors[constituency] |
223 | | - |
224 | | - # Adjusted value = population × wealth factor |
225 | | - adjusted_value = pop * wealth_factor |
226 | | - |
227 | | - council_data[council].append((constituency, pop, wealth_factor, adjusted_value)) |
228 | | - |
229 | | - # Calculate weights within each council using adjusted values |
230 | | - for council, constituencies in council_data.items(): |
231 | | - total_adjusted = sum(adj for _, _, _, adj in constituencies) |
232 | | - for constituency, pop, wealth_factor, adjusted_value in constituencies: |
233 | | - # Weight based on adjusted value, not raw population |
234 | | - weight = ( |
235 | | - adjusted_value / total_adjusted |
236 | | - if total_adjusted > 0 |
237 | | - else 1 / len(constituencies) |
238 | | - ) |
239 | | - weights[constituency] = { |
240 | | - "council": council, |
241 | | - "population": pop, |
242 | | - "wealth_factor": wealth_factor, |
243 | | - "weight": weight, |
244 | | - } |
245 | | - |
246 | | - return weights |
247 | | - |
248 | | - |
249 | | -def analyze_constituencies( |
250 | | - data_dir: Optional[Path] = None, verbose: bool = True |
| 127 | +def generate_band_h_csv( |
| 128 | + data_dir: Optional[Path] = None, |
| 129 | + output_path: Optional[Path] = None, |
| 130 | + verbose: bool = True |
251 | 131 | ) -> pd.DataFrame: |
252 | | - """Distribute council-level estimates to constituencies using wealth-adjusted weights. |
| 132 | + """Generate CSV with Band H properties by constituency. |
253 | 133 |
|
254 | 134 | Args: |
255 | | - data_dir: Directory containing data files. Defaults to package data dir. |
| 135 | + data_dir: Directory containing NRS data files. Defaults to get_data_dir() when None. |
| 136 | + output_path: Where to save the CSV. If None, no file is written; the DataFrame is returned either way. |
256 | 137 | verbose: Print progress messages. |
257 | 138 |
|
258 | 139 | Returns: |
259 | | - DataFrame with constituency-level analysis results. |
| 140 | + DataFrame with constituency, council, band_h_properties, |
| 141 | + total_dwellings, pct_band_h. |
260 | 142 | """ |
261 | | - if verbose: |
262 | | - print("=" * 70) |
263 | | - print("Scottish Mansion Tax Analysis by Parliament Constituency") |
264 | | - print("Using wealth-adjusted weights (population × Band H factor)") |
265 | | - print("=" * 70) |
266 | | - |
267 | | - # Load population data |
268 | | - if verbose: |
269 | | - print("\n📊 Loading NRS population data...") |
270 | | - population_df = load_population_data(data_dir, verbose) |
271 | | - if verbose: |
272 | | - print(f" ✓ Loaded {len(population_df)} constituencies") |
273 | | - |
274 | | - # Load wealth factors from Council Tax Band H data |
275 | | - if verbose: |
276 | | - print("\n💎 Loading Council Tax Band H data (wealth proxy)...") |
277 | | - wealth_factors = load_wealth_factors(data_dir, verbose) |
278 | | - if verbose: |
279 | | - print(f" ✓ Loaded wealth factors for {len(wealth_factors)} constituencies") |
| 143 | + if data_dir is None: |
| 144 | + data_dir = get_data_dir() |
280 | 145 |
|
281 | | - # Calculate wealth-adjusted weights |
282 | 146 | if verbose: |
283 | | - print("\n📈 Calculating wealth-adjusted weights...") |
284 | | - weights = calculate_wealth_adjusted_weights(population_df, wealth_factors) |
285 | | - |
286 | | - # Calculate total sales for normalization |
287 | | - total_sales = sum(COUNCIL_DATA.values()) |
288 | | - |
289 | | - results = [] |
| 147 | + print("Loading Band H data from NRS...") |
290 | 148 |
|
291 | | - for constituency, data in weights.items(): |
292 | | - council = data["council"] |
293 | | - weight = data["weight"] |
294 | | - population = data["population"] |
295 | | - wealth_factor = data["wealth_factor"] |
| 149 | + # Load dwelling estimates with Band H |
| 150 | + dwelling_file = data_dir / "dwelling_estimates_by_dz.xlsx" |
| 151 | + df = pd.read_excel(dwelling_file, sheet_name="2023", header=4) |
| 152 | + df.columns = df.columns.str.replace("\n", " ").str.strip() |
296 | 153 |
|
297 | | - # Get council's total sales |
298 | | - if council not in COUNCIL_DATA: |
299 | | - raise ValueError(f"Council {council} not in COUNCIL_DATA") |
300 | | - council_sales = COUNCIL_DATA[council] |
| 154 | + dz_data = df[["Data Zone code", "Total number of dwellings", "Council Tax band: H"]].copy() |
| 155 | + dz_data.columns = ["DataZone", "TotalDwellings", "BandH"] |
| 156 | + dz_data = dz_data.dropna(subset=["DataZone"]) |
301 | 157 |
|
302 | | - # Allocate to constituency based on wealth-adjusted weight |
303 | | - constituency_sales = council_sales * weight |
| 158 | + # Load DZ to Constituency lookup |
| 159 | + lookup = pd.read_csv(data_dir / "dz_to_constituency_lookup.csv") |
304 | 160 |
|
305 | | - # Calculate share of total |
306 | | - share = constituency_sales / total_sales if total_sales > 0 else 0 |
| 161 | + # Merge and aggregate |
| 162 | + merged = dz_data.merge(lookup, on="DataZone", how="left") |
| 163 | + constituency_data = merged.groupby("ConstituencyCode").agg({ |
| 164 | + "TotalDwellings": "sum", |
| 165 | + "BandH": "sum" |
| 166 | + }).reset_index() |
307 | 167 |
|
308 | | - # Band breakdown |
309 | | - band_i_sales = constituency_sales * BAND_I_RATIO |
310 | | - band_j_sales = constituency_sales * BAND_J_RATIO |
| 168 | + # Load constituency names |
| 169 | + names = pd.read_csv(data_dir / "constituency_names.csv") |
| 170 | + name_lookup = dict(zip(names["Code"], names["Name"])) |
311 | 171 |
|
312 | | - # Calculate implied revenue from sales using UK rates |
313 | | - implied_from_sales = (band_i_sales * BAND_I_SURCHARGE) + ( |
314 | | - band_j_sales * BAND_J_SURCHARGE |
315 | | - ) |
316 | | - |
317 | | - rounded_sales = round(constituency_sales) |
318 | | - results.append( |
319 | | - { |
320 | | - "constituency": constituency, |
321 | | - "council": council, |
322 | | - "population": population, |
323 | | - "wealth_factor": wealth_factor, |
324 | | - "weight": round(weight, 4), |
325 | | - "estimated_sales": rounded_sales, |
326 | | - "band_i_sales": round(band_i_sales), |
327 | | - "band_j_sales": round(band_j_sales), |
328 | | - "share_pct": round(share * 100, 2) if rounded_sales > 0 else 0, |
329 | | - "implied_from_sales": round(implied_from_sales) |
330 | | - if rounded_sales > 0 |
331 | | - else 0, |
332 | | - } |
333 | | - ) |
334 | | - |
335 | | - df = pd.DataFrame(results) |
336 | | - df = df.sort_values("estimated_sales", ascending=False) |
337 | | - |
338 | | - # Calculate total revenue using simple formula: Stock × Average Rate |
339 | | - avg_rate = BAND_I_RATIO * BAND_I_SURCHARGE + BAND_J_RATIO * BAND_J_SURCHARGE |
340 | | - total_stock_revenue = ESTIMATED_STOCK * avg_rate # 11,481 × £1,607 = £18.5m |
341 | | - |
342 | | - # Allocate total revenue proportionally by each constituency's share |
343 | | - df["allocated_revenue"] = (df["share_pct"] / 100 * total_stock_revenue).round(0) |
| 172 | + # Build output |
| 173 | + results = [] |
| 174 | + for _, row in constituency_data.iterrows(): |
| 175 | + name = name_lookup.get(row["ConstituencyCode"], row["ConstituencyCode"]) |
| 176 | + total = int(row["TotalDwellings"]) |
| 177 | + band_h = int(row["BandH"]) |
| 178 | + pct = (band_h / total * 100) if total > 0 else 0 |
| 179 | + council = CONSTITUENCY_COUNCIL_MAPPING.get(name, "Unknown") |
| 180 | + results.append({ |
| 181 | + "constituency": name, |
| 182 | + "council": council, |
| 183 | + "band_h_properties": band_h, |
| 184 | + "total_dwellings": total, |
| 185 | + "pct_band_h": round(pct, 4) |
| 186 | + }) |
| 187 | + |
| 188 | + out_df = pd.DataFrame(results) |
| 189 | + out_df = out_df.sort_values("pct_band_h", ascending=False) |
344 | 190 |
|
345 | 191 | if verbose: |
346 | | - # Print summary |
347 | | - print(f"\n📊 Total constituencies: {len(df)}") |
348 | | - print( |
349 | | - f"📈 Total £1m+ sales: {df['estimated_sales'].sum():.0f} (for geographic distribution)" |
350 | | - ) |
351 | | - print(f"🏠 Estimated £1m+ stock: {ESTIMATED_STOCK:,} (Savills)") |
352 | | - print(f"\n💰 Revenue calculation:") |
353 | | - print(f" Band I rate: £{BAND_I_SURCHARGE:,}/year ({BAND_I_RATIO:.1%} of properties)") |
354 | | - print(f" Band J rate: £{BAND_J_SURCHARGE:,}/year ({BAND_J_RATIO:.1%} of properties)") |
355 | | - print(f" Average rate: £{avg_rate:,.0f}/year") |
356 | | - print( |
357 | | - f" Formula: Stock × Avg Rate = {ESTIMATED_STOCK:,} × £{avg_rate:,.0f} = £{total_stock_revenue/1e6:.1f}m" |
358 | | - ) |
359 | | - |
360 | | - print("\n🏛️ Top 10 Constituencies by Impact:") |
361 | | - print("-" * 90) |
362 | | - print(f"{'Constituency':<40} {'Council':<20} {'Sales':>6} {'Revenue':>12}") |
363 | | - print("-" * 90) |
364 | | - |
365 | | - for _, row in df.head(10).iterrows(): |
366 | | - council_short = ( |
367 | | - row["council"][:19] if len(row["council"]) > 19 else row["council"] |
368 | | - ) |
369 | | - print( |
370 | | - f"{row['constituency']:<40} {council_short:<20} " |
371 | | - f"{row['estimated_sales']:>6} £{row['allocated_revenue']/1e6:>10.2f}m" |
372 | | - ) |
373 | | - |
374 | | - # Edinburgh subtotal |
375 | | - edinburgh_df = df[df["council"] == "City of Edinburgh"] |
376 | | - print(f"\n📍 Edinburgh Total (6 constituencies):") |
377 | | - print( |
378 | | - f" {edinburgh_df['estimated_sales'].sum():.0f} sales, " |
379 | | - f"£{edinburgh_df['allocated_revenue'].sum()/1e6:.1f}m " |
380 | | - f"({edinburgh_df['share_pct'].sum():.1f}%)" |
381 | | - ) |
382 | | - |
383 | | - return df |
384 | | - |
385 | | - |
386 | | -def get_summary_stats(df: pd.DataFrame) -> dict: |
387 | | - """Get summary statistics from analysis results. |
388 | | -
|
389 | | - Args: |
390 | | - df: DataFrame from analyze_constituencies() |
391 | | -
|
392 | | - Returns: |
393 | | - Dictionary with summary statistics. |
394 | | - """ |
395 | | - avg_rate = BAND_I_RATIO * BAND_I_SURCHARGE + BAND_J_RATIO * BAND_J_SURCHARGE |
396 | | - |
397 | | - edinburgh_df = df[df["council"] == "City of Edinburgh"] |
398 | | - |
399 | | - return { |
400 | | - "total_constituencies": len(df), |
401 | | - "constituencies_with_sales": len(df[df["estimated_sales"] > 0]), |
402 | | - "total_sales": df["estimated_sales"].sum(), |
403 | | - "estimated_stock": ESTIMATED_STOCK, |
404 | | - "total_revenue": df["allocated_revenue"].sum(), |
405 | | - "average_rate": avg_rate, |
406 | | - "edinburgh_revenue": edinburgh_df["allocated_revenue"].sum(), |
407 | | - "edinburgh_share_pct": edinburgh_df["share_pct"].sum(), |
408 | | - "top_constituency": df.iloc[0]["constituency"], |
409 | | - "top_constituency_revenue": df.iloc[0]["allocated_revenue"], |
410 | | - } |
| 192 | + total_band_h = out_df['band_h_properties'].sum() |
| 193 | + total_dwellings = out_df['total_dwellings'].sum() |
| 194 | + print(f"Scotland total: {total_band_h:,} Band H properties") |
| 195 | + print(f"Scotland total: {total_dwellings:,} dwellings") |
| 196 | + print(f"Scotland average: {total_band_h / total_dwellings * 100:.2f}%") |
| 197 | + print() |
| 198 | + print("Top 5 by % Band H:") |
| 199 | + for _, row in out_df.head(5).iterrows(): |
| 200 | + print(f" {row['constituency']}: {row['pct_band_h']:.2f}%") |
| 201 | + |
| 202 | + if output_path: |
| 203 | + out_df.to_csv(output_path, index=False) |
| 204 | + if verbose: |
| 205 | + print(f"\nSaved to {output_path}") |
| 206 | + |
| 207 | + return out_df |
0 commit comments