-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathsync_archlinux.py
More file actions
145 lines (128 loc) · 3.84 KB
/
sync_archlinux.py
File metadata and controls
145 lines (128 loc) · 3.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#!/usr/bin/env python
# author: Ni Qingliang
# NOTE: this script can be used to sync arch repository which is accessed through
# http
import datetime
import fileinput
import os
import re
import shutil
import tempfile
import time
import urllib.parse
from subprocess import call

from bs4 import BeautifulSoup
#g_host = "mirrors.aliyun.com"
#g_host = "mirrors.163.com"
#g_host = "mirror.bjtu.edu.cn"
g_host = "mirrors.neusoft.edu.cn"
g_loc_base_dir = "./repo"
class csub_rep:
def __init__(self, repo_name, main_page, loc_dir):
self._repo_name = repo_name
self._main_page = main_page
self._loc_dir = loc_dir
self._fl_new = []
self._fl_old = []
# get index
tmp_dir = tempfile.mkdtemp(prefix="syncarch_") + "/"
call("wget --progress=bar"
+ " -O index.html"
+ " --timeout=60"
+ " -P " + tmp_dir
+ " " + self._main_page, shell=True)
soup = BeautifulSoup(open("index.html"), "html.parser")
os.rmdir(tmp_dir)
tmp_node = soup.pre # 163
if not tmp_node:
tmp_node = soup.find("table") # bjtu
if tmp_node:
if tmp_node.find("tbody"):
tmp_node = tmp_node.find("tbody")
if not tmp_node:
print(" no data found!!!")
for a in tmp_node.find_all("a"):
# string的内容如果太长,html显示的是省略号
#self._fl_new.add(a.string);
file_name = a["href"]
#print(file_name)
#if not file_name:
# continue
#if re.match(".*/.*", file_name):
# continue
# 去除上一级目录
if file_name == "../":
continue
# 处理 %
file_name = urllib.parse.unquote(string=file_name, errors="strict")
# 移除开头的./,163的有问题
self._fl_new.append(re.sub("^\./", "", file_name));
#for f in self._fl_new:
# print(f)
#print(len(self._fl_new))
# create locale directory
if not os.path.exists(self._loc_dir):
os.makedirs(self._loc_dir)
else:
self._fl_old = os.listdir(self._loc_dir)
#for f in self._fl_old:
# print(f)
#print(len(self._fl_old))
# abs list
self.__abs_list = []
for i in (
".db",
".db.tar.gz",
".db.tar.gz.old",
".files",
".files.tar.gz",
".files.tar.gz.old",
".links.tar.gz"):
self.__abs_list.append(self._repo_name + i)
def download(self):
dl_list = list(set(self._fl_new) - set(self._fl_old) - set(self.__abs_list))
dl_list.sort()
for i in dl_list:
print("dling " + i)
#call("wget -N --progress=bar -P " + self._loc_dir + " " + self._main_page + i, shell=True)
cmd = "axel " \
+ "-n " + "32" + " " \
+ "-a -v " \
+ "-U " + "'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/11.1 Safari/605.1.15'" + " " \
+ "-o '" + self._loc_dir + "' " \
+ self._main_page + i
call(cmd, shell=True)
if len(dl_list) != 0:
abs_names = self.__abs_list
bkup_dir = tempfile.mkdtemp(prefix="syncarch_") + "/"
for i in abs_names:
print("dling " + i)
call("wget -N --progress=bar -P " + bkup_dir + " " + self._main_page + i, shell=True)
for i in abs_names:
print("mving " + i)
call("mv " + bkup_dir + i + " " + self._loc_dir + i, shell=True)
os.rmdir(bkup_dir)
# check if we got right file lists
tmp_list = list(set(self.__abs_list) - set(self._fl_new))
for i in tmp_list:
print("check for right file lists: " + self._repo_name + ": " + i)
if len(tmp_list) != 0:
print("error for " + self._repo_name + ": can't remove for error index.")
return
# remove old files
for i in (set(self._fl_old) - set(self._fl_new)):
print("rming " + i)
os.remove(self._loc_dir + i)
if __name__ == '__main__':
repos = [
"core",
"extra",
# "community",
"multilib",
# "testing",
# "multilib-testing",
# "community-testing"
]
for repo in repos:
print("dling repo %s:" % (repo))
test = csub_rep(repo, "http://" + g_host + '/archlinux/' + repo + '/os/x86_64/', g_loc_base_dir + "/" + repo + "/os/x86_64/")
test.download()