一、各种OJ
- 题库 - Codeforces:https://codeforces.com/problemset
- 题库 - 洛谷:https://www.luogu.com.cn/problem/list
- 题库 - HydroOJ:https://hydro.ac/d/bzoj/p
- 题库 - vjudge:https://vjudge.net/problem
二、数据下载
三、题面爬取
洛谷题面爬取
import re
import urllib.request,urllib.error
import bs4
import ssl
# NOTE(review): this swaps the default HTTPS context factory for the
# unverified one, so TLS certificates are not checked for any urllib HTTPS
# request made by this process. Confirm this is acceptable before reuse.
ssl._create_default_https_context = ssl._create_unverified_context
# URL prefix of a Luogu "P" problem page; the problem number is appended to it.
baseUrl = "https://www.luogu.com.cn/problem/P"
# Directory prefix (including the trailing separator) that the output file
# name is appended to when saving.
savePath ="D:\\信息学竞赛\\LuoguProblem\\"
def main():
    """Ask for a problem number, fetch its page, and save it as Markdown.

    The number is read from standard input, the page is downloaded with
    ``getHTML``, converted with ``getMD``, printed, and written to
    ``P<number>.md`` through ``saveData``. If ``getHTML`` reports the
    "noFind" sentinel, a message is printed and nothing is saved.
    """
    # Strip surrounding whitespace: a stray space would otherwise end up in
    # the URL and make the request fail.
    pNum = input("请输入题目编号:").strip()
    # Accept "P1001" as well as "1001"; baseUrl already ends with the "P".
    if pNum[:1] in ("P", "p"):
        pNum = pNum[1:]

    html = getHTML(baseUrl + pNum)
    if html == "noFind":
        print("没有该题目")
        return

    problemMD = getMD(html)
    print("爬取转换完成!\nmd为:\n")
    print(problemMD)
    saveData(problemMD, "P" + pNum + ".md")
def getHTML(url):
    """Download *url* and return its body decoded as UTF-8.

    Returns the sentinel string ``"noFind"`` when the problem does not
    exist, which is detected in two ways:

    * the server answers with HTTP 404 (``urlopen`` raises ``HTTPError``
      for that status, so it has to be caught explicitly), or
    * the returned page contains the text ``NotFoundHttpException``.

    Any other HTTP or network error is propagated to the caller.
    """
    headers = {
        "user-agent": "Mozilla / 5.0(Windows NT 10.0;Win64;x64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 85.0.4183.121 Safari / 537.36"
    }
    request = urllib.request.Request(url=url, headers=headers)
    try:
        # The context manager closes the connection even if read/decode fails.
        with urllib.request.urlopen(request) as response:
            html = response.read().decode('utf-8')
    except urllib.error.HTTPError as err:
        if err.code == 404:
            return "noFind"
        raise

    # Luogu's "problem not found" page contains this marker text.
    if "NotFoundHttpException" in html:
        return "noFind"
    return html
def getMD(html):
    """Convert the first ``<article>`` element of *html* into Markdown text.

    ``<h1>``/``<h2>``/``<h3>`` opening tags become Markdown heading
    prefixes, ``<pre><code>`` blocks become fenced code blocks, every
    remaining tag is removed, and HTML entities are decoded so that
    characters such as ``<`` and ``&`` appear literally in the result.

    Raises ``IndexError`` if the document has no ``<article>`` element.
    """
    # Imported locally: the parameter name shadows the ``html`` module here.
    from html import unescape

    bs = bs4.BeautifulSoup(html, "html.parser")
    core = bs.select("article")[0]
    md = str(core)

    # Literal tag -> Markdown substitutions; no regex needed for fixed text.
    substitutions = (
        ("<h1>", "# "),
        ("<h2>", "## "),
        ("<h3>", "#### "),
        ("<pre><code>", "``` \n"),
        ("</code></pre>", "\n``` "),
    )
    for tag, markdown in substitutions:
        md = md.replace(tag, markdown)

    # Drop every remaining opening/closing tag.
    md = re.sub(r"</?[a-zA-Z]+[^<>]*>", "", md)
    # Decode entities last so a decoded "<" is never mistaken for a tag.
    return unescape(md)
def saveData(data, filename):
    """Write *data* to ``savePath + filename`` as UTF-8 text.

    *data* is normally a single string and is written in one call. Any
    other iterable is written item by item with ``writelines``, as before.
    The file is closed even if a write fails.
    """
    cfilename = savePath + filename
    with open(cfilename, "w", encoding="utf-8") as file:
        if isinstance(data, str):
            # One write instead of one call per character.
            file.write(data)
        else:
            for d in data:
                file.writelines(d)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
版权属于:PCsky
本文链接:http://hyouka.club/index.php/archives/197/
转载时须注明出处及本声明