get_wikidata_lists.py
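"""Build seed-URL lists from Wikidata.

For each *.wikidata file in the target directory (each holding a single
Wikidata entity ID, e.g. Q42), query the Wikidata Query Service for items
linked to that entity through a set of properties, collect their official
websites (P856), and write the prefixed, sorted URL list to
<name>.wikidata.txt.
"""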
import json
import os
import re
import requests
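
# Public SPARQL endpoint of the Wikidata Query Service.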
ENDPOINT = 'https://query.wikidata.org/sparql'
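# Option strings prepended to each emitted URL; they are interpreted by
# whatever consumes the generated lists. PREFIX_FEED adds deep_extract=1
# (presumably for feed URLs) and is currently unused in this script.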
PREFIX = 'random=RANDOM;all=1;keep_all=1;depth=1;url='
PREFIX_FEED = 'random=RANDOM;all=1;keep_all=1;depth=1;deep_extract=1;url='
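# Legacy query that unioned several properties in a single request. Kept for
# reference; superseded by the per-property QUERY_TEMPLATE below.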
QUERY_TEMPLATE_OLD = '''
SELECT DISTINCT ?item ?website WHERE {
  {
    ?item p:P31 ?statement0 .
    ?statement0 (ps:P31/(wdt:P31*)/(wdt:P279*)) wd:{q} .
  }
  UNION
  {
    ?item p:P452 ?statement1 .
    ?statement1 (ps:P452/(wdt:P31*)/(wdt:P279*)) wd:{q} .
  }
  UNION
  {
    ?item p:P3912 ?statement1 .
    ?statement1 (ps:P3912/(wdt:P31*)/(wdt:P279*)) wd:{q} .
  }
  UNION
  {
    ?item p:P361 ?statement1 .
    ?statement1 (ps:P361/(wdt:P31*)/(wdt:P279*)) wd:{q} .
  }
  UNION
  {
    ?item p:P101 ?statement1 .
    ?statement1 (ps:P101/(wdt:P31*)/(wdt:P279*)) wd:{q} .
  }
  UNION
  {
    ?item p:P127 ?statement1 .
    ?statement1 (ps:P127/(wdt:P31*)/(wdt:P279*)) wd:{q} .
  }
  ?item wdt:P856 ?website .
}
'''
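# Per-property query: match items whose property {p} leads (via any chain of
# P31 "instance of" / P279 "subclass of") to the target entity {q} and that
# declare an official website (P856). {p} and {q} are substituted textually
# in main(), since str.format would trip over the braces in the SPARQL.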
QUERY_TEMPLATE = '''
SELECT DISTINCT ?item ?website WHERE {
  {
    ?item p:{p} ?statement0 .
    ?statement0 (ps:{p}/(wdt:P31*)/(wdt:P279*)) wd:{q} .
  }
  ?item wdt:P856 ?website .
}
'''
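# Properties substituted for {p}, one query each.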
P_TERMS = [
'P31',
'P452',
'P3912',
'P361',
'P101',
'P127',
'P366',
'P1269'
]
def main(directory: str = '.'):
    # Each *.wikidata file holds one Wikidata entity ID; process them in turn.
    # (Was os.listdir('.'), which ignored the directory argument.)
    for filename in os.listdir(directory):
        if not filename.endswith('.wikidata'):
            continue
        print('processing', filename)
        filepath = os.path.join(directory, filename)
        with open(filepath, 'r') as f:
            wikidata_q = f.read().strip()
        sites = set()
        for wikidata_p in P_TERMS:
            print(wikidata_p)
            # Build the query for this (property, entity) pair.
            query = QUERY_TEMPLATE.replace('{q}', wikidata_q).replace('{p}', wikidata_p)
            # Retry until the endpoint returns a parseable 200 response;
            # only a KeyboardInterrupt breaks out of the loop.
            while True:
                try:
                    response = requests.get(
                        ENDPOINT,
                        params={
                            'query': query,
                            'format': 'json'
                        },
                        timeout=1000
                    )
                    assert response.status_code == 200
                    data = json.loads(response.content, strict=False)
                    break
                except KeyboardInterrupt:
                    raise
                except Exception as e:
                    print(str(e))
                    print('retrying')
            print(len(data['results']['bindings']))
            for d in data['results']['bindings']:
                if 'website' not in d:
                    continue
                site = d['website']['value']
                # Keep only http(s) URLs; give bare origins a trailing slash.
                if not re.search(r'^https?://', site, flags=re.I):
                    continue
                if site.count('/') == 2:
                    site += '/'
                sites.add(PREFIX + site)
                # Also add the site root so the host itself is covered.
                # (Guarded: the original called .group(1) on a possibly-None match.)
                root = re.search(r'^(https?://+[^/]+/)', site, flags=re.I)
                if root:
                    sites.add(PREFIX + root.group(1))
        # One sorted, de-duplicated list per input file: <name>.wikidata.txt.
        with open(filepath + '.txt', 'w') as f:
            f.write('\n'.join(sorted(sites)))


if __name__ == '__main__':
    main()