-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathtests.py
55 lines (44 loc) · 1.55 KB
/
tests.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import os, re, codecs, subprocess
import shutil, stat, errno, sys, operator
import urllib.parse, html
from lxml import etree
# ---
def test1():
    """Print the anchor count of each identified nav div in one saved page.

    Parses a single hard-coded HTML file, looks at every ``div`` nested under
    the element with id ``I_nav`` and, for each one that has an ``id`` other
    than ``reference_box`` or ``I_subnav``, prints how many ``<a>`` elements
    it contains.
    """
    page_path = 'e:/tools/wget/cplusplus_reference_v1/www.cplusplus.com/reference/ios/ios/index.html'
    tree = etree.parse(page_path, etree.HTMLParser())
    skipped_ids = ('reference_box', 'I_subnav')
    for div in tree.xpath('//div[@id="I_nav"]//div'):
        div_id = div.get('id')
        # Ignore divs without an id as well as the two excluded containers.
        if not div_id or div_id in skipped_ids:
            continue
        anchors = div.xpath('.//a')
        print(str(len(anchors)))
# ---
def test(ref_path='e:/tools/wget/cplusplus_reference/www.cplusplus.com/reference/',
         web_ref_prefix='http://www.cplusplus.com/reference/'):
    """Report absolute reference links in a mirrored tree and their relative form.

    Walks ``ref_path``, parses every file whose extension contains ``htm``
    and, for each ``<a>`` whose ``href`` contains both ``http:`` and
    ``web_ref_prefix``, prints the original href, the href rewritten relative
    to the page's own directory, and the link text.

    Args:
        ref_path: Root directory of the mirrored reference tree.
        web_ref_prefix: URL prefix that corresponds to ``ref_path``. The
            previous code read this name without it being defined anywhere
            in the file; the default here is inferred from the mirror's
            directory layout -- TODO confirm.
    """
    for root, _dirs, files in os.walk(ref_path):
        for name in files:
            if 'htm' not in os.path.splitext(name)[1]:
                continue
            # os.path.join avoids the doubled separator that plain string
            # concatenation produced for files directly under ref_path;
            # forward slashes keep the output uniform on Windows.
            fp = os.path.join(root, name).replace('\\', '/')
            html_doc = etree.parse(fp, etree.HTMLParser())
            links = html_doc.xpath('//a[contains(@href,"http:")]')
            if not links:
                continue
            print('\n-- ' + fp)
            # Directory depth of this page below ref_path: one '../' per
            # level is needed to climb back to the reference root. It is the
            # same for every link in the file, so compute it once.
            rel_fp = os.path.relpath(fp, ref_path).replace('\\', '/')
            up_prefix = '../' * rel_fp.count('/')
            for link in links:
                href = link.get("href")
                if web_ref_prefix not in href:
                    continue
                rel_href = up_prefix + href.replace(web_ref_prefix, '')
                text = link.xpath("string()")
                print(' {0}\n {1}\n {2}'.format(href, rel_href, text))
# -------------------
# Script entry: run the link report. This is an unguarded module-level call,
# so it also executes whenever this file is imported.
test()