环境:PyCharm 2019.3 + Anaconda3 2020.02
代码
import urllib.request
import time
import re
import os
def save(page_id, title, en_title, college, info, source):
    """Append one program record to the result file ``2017210213.txt``.

    A record is six ``key:value`` lines (id, title, en_title, college, info,
    source) followed by one blank separator line. The file is opened in
    append mode without an explicit encoding, so the platform default
    encoding is used.

    Args:
        page_id: Identifier of the crawled page; written as the ``id`` field.
        title: Program title.
        en_title: English program title.
        college: College name.
        info: Program description. If it contains any character that GBK
            cannot encode, the whole field is replaced by a placeholder.
        source: Program source / category text.
    """
    with open('2017210213.txt', 'a') as f1:
        try:
            # Only used as an encodability probe; the result is discarded.
            info.encode('gbk')
        except UnicodeEncodeError:
            info = 'gbk无法编码'
        key = 'id:{}\ntitle:{}\nen_title:{}\ncollege:{}\ninfo:{}\nsource:{}\n\n' \
            .format(page_id, title, en_title, college, info, source)
        # The remaining fields may also hold characters outside GBK. Replace
        # those with '?' instead of letting one bad character abort the
        # whole crawl with UnicodeEncodeError.
        key_gbk = key.encode('gbk', errors='replace').decode('gbk')
        f1.write(key_gbk)
        print(page_id, 'save')
def get_page(page_id):
    """Download the program page for ``page_id`` and return it as text.

    Builds an opener carrying browser-like request headers, installs it as
    the global ``urllib.request`` opener, fetches
    ``https://win.bupt.edu.cn/program.do?id=<page_id>`` and decodes the body
    as UTF-8. Sleeps 3 seconds after each download to throttle the crawl.

    Args:
        page_id: Numeric id appended to the program URL.

    Returns:
        The decoded response body as ``str``.

    Any exception raised by ``urlopen`` or by the UTF-8 decode propagates to
    the caller.
    """
    opener = urllib.request.build_opener()
    # The attribute read by OpenerDirector is ``addheaders``; assigning to
    # ``add_headers`` silently creates an unused attribute and the custom
    # headers are never sent.
    opener.addheaders = [
        ("accept-encoding", "identity;q=1,*;q=0"),
        ("accept-language", "zh-CN,zh;q=0.9"),
        ('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36')
    ]
    urllib.request.install_opener(opener)
    url = 'https://win.bupt.edu.cn/program.do?id=' + str(page_id)
    # Read and close the response before sleeping so the connection is not
    # held open during the throttle delay.
    with urllib.request.urlopen(url) as response:
        text = response.read().decode('utf-8')
    time.sleep(3)
    print(page_id, 'got')
    return text
def find(page_id, text):
    """Extract the program fields from a page and store them via ``save``.

    Two regular expressions are applied to ``text``: one captures the title,
    English title, college and description from the HTML, the other captures
    the source name from the ``var eval_score`` script fragment (the text
    preceding the escaped suffix ``\\u8bad\\u7ec3\\u8ba1\\u5212``). When the
    first pattern matches, the record is passed to ``save``; otherwise a
    "not exist" notice is printed.

    Args:
        page_id: Id of the page the text was downloaded from.
        text: Page content as returned by ``get_page``.

    Returns:
        ``page_id``, unchanged.
    """
    reg = r'<h2 style="display:inline">(.*?)</h2>.*?' \
          r'<h3 style="display:inline;"> (.*?)</h3>.*?' \
          r'<div style="font-size:17px;line-height:25px;">.*?' \
          r'<div style="margin-top:-7px;overflow: hidden;white-space: nowrap;text-overflow: ellipsis;">(.*?)</div>.*?' \
          r'<div style="font-size:17px;line-height:25px;">(.*?)</div>'
    pattern = re.compile(reg, re.S)
    key = pattern.findall(text)
    pattern_s = re.compile(r'var eval_score.*?name":"(.*?)\\u8bad\\u7ec3\\u8ba1\\u5212', re.S)
    s = pattern_s.findall(text)
    if key:
        title, en_title, college, info = key[0]
        title = title.strip()
        en_title = en_title.strip()
        college = college.strip()
        # Flatten the description onto a single line so each record keeps a
        # fixed number of lines in the output file.
        info = info.replace('\n', ' ').replace('\r', ' ').strip()
        if s:
            # The capture holds literal "\uXXXX" escapes. Encoding with
            # backslashreplace first turns any real non-ASCII character into
            # an escape too, so unicode_escape decodes everything correctly
            # instead of producing mojibake.
            source = s[0].strip() \
                .encode('ascii', 'backslashreplace').decode('unicode_escape')
        else:
            # Page matched the main layout but has no source fragment; keep
            # the record rather than crashing on s[0].
            source = ''
        save(page_id, title, en_title, college, info, source)
    else:
        print(page_id, 'not exist')
    return page_id
# Resume support: continue after the last id already stored in the result
# file, or start after 1200 when there is no usable previous output.
last_id = 1200
if os.path.exists('2017210213.txt'):
    with open('2017210213.txt', 'r') as f:
        for line in f:
            # Each record is 7 lines long (6 fields + a blank separator), so
            # a fixed negative index such as lines[-6] lands on the title
            # line. Track the last line that actually is an "id:<n>" line.
            id_match = re.match(r'id:(\d+)', line)
            if id_match:
                last_id = int(id_match.group(1))
    print('begin from', last_id)
for i in range(last_id + 1, 1414):
    page = get_page(i)
    find(i, page)
后记
这个作业因为在写入txt文件时多了一个换行符,被扣了10分。Black同学告诉我群里说过这个要求,我太阳痿了。
Black同学又出现了!てぇてぇ!