"""
Spyder Editor
This is a temporary script file.
"""
import json
import requests as r
import os
root='wiki'
if os.path.exists(root)==False:
os.makedirs(root)
ids=0
while 0>-1:
ids=ids+1
url='http://wiki.jowei19.com/api/GetWikiInfoById?wikiId='+str(ids)
a = r.get(url)
aa=a.text
khjc = r.get('http://wiki.jowei19.com/api/GetWikiInfoById?wikiId='+str(ids+1))
kh2=khjc.text
if aa=='未找到素材:"errmsg"' and kh2==aa:
print(ids-1)
break
else:
print(ids)
if aa=='未找到素材:"errmsg"':
continue
js=json.loads(aa)
name=js.get('Title')
name=name.replace('/','')
name=name.replace(':','')
name=name.replace('>','')
name=name.replace('<','')
name=name.replace('=','')
name=name.replace(' ','')
name=name.replace('\t','')
name=name.replace('\"','')
name=name.replace('\'','')
with open('wiki/'+name+'.json','w',encoding='utf-8') as k:
json.dump(js,k,ensure_ascii=False)
if ids>=10000:
break
As everyone knows, requests to the AU server are fairly slow. With two or three thousand entries, every debugging run means roughly ten minutes of crawling, which is hopelessly inefficient; even the most patient grinder can't put up with that.
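If the crawl itself needs to be faster, one option (a sketch, not part of the original scripts) is to fetch several ids in parallel. The worker count and the fixed id range below are assumptions, and the stop-after-two-missing-ids logic from the script above would have to be reintroduced if the upper bound is unknown:

import json
import os
from concurrent.futures import ThreadPoolExecutor

import requests as r

API = 'http://wiki.jowei19.com/api/GetWikiInfoById?wikiId='

def fetch(wiki_id):
    # One GET per id; return the parsed entry, or None on the API's
    # literal "not found" reply.
    text = r.get(API + str(wiki_id)).text
    if text == '未找到素材:"errmsg"':
        return None
    return json.loads(text)

os.makedirs('wiki', exist_ok=True)
# 8 workers and the 1..3000 id range are illustrative guesses, not tuned values.
with ThreadPoolExecutor(max_workers=8) as pool:
    for wiki_id, entry in enumerate(pool.map(fetch, range(1, 3001)), start=1):
        if entry is not None:
            with open('wiki/' + str(wiki_id) + '.json', 'w', encoding='utf-8') as f:
                json.dump(entry, f, ensure_ascii=False)

Naming the files after the numeric id, as here, also sidesteps the title-sanitizing problem that comes up again later.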
"""
Created on Sun Jan 8 12:36:35 2023
@author: 26409
"""
import requests as r
import os
import json
root='wiki'
for dirpath , dirnames , filenames in os.walk(root):
wj=filenames
li=[]
comm=[]
for i in wj:
with open('wiki/'+i,'r',encoding='utf-8') as j:
js=json.load(j)
name=js.get('Title')
chang=js.get('Length')
bian=js.get('EditLogs')
author=js.get('Creator')
com=js.get('Comments')
ids=js.get('Id')
if type(js.get('ContainsWikiId'))==list:
yin2=js.get('ContainsWikiId')
yin3=[]
for ii in yin2:
if ii not in yin3:
yin3.append(ii)
yin=len(yin3)
else:
yin=0
if len(bian)!=0:
user=[author]#包括作者自己,即使他没编辑
for l in bian:
ren=l['User']
if ren not in user:
user.append(ren)
renshu=len(user)
else:
renshu=1#编辑人数
#编辑次数
ci=len(bian)
#评论部分
if []!=com:
for c in com:
FromUser=c['FromUser']
Content=c['Content']
pl=(name,FromUser,Content)
comm.append(pl)
pls=len(com)
plr=[]
for d in com:
FromUser=d['FromUser']
if FromUser not in plr:
plr.append(FromUser)
plrs=len(plr)
sz=(name,author,yin,chang,renshu,ci,pls,plrs,yin3,ids)
#词条名称,作者,引用数量,长度,编辑人数,编辑次数,评论数量,评论人数,引用,ID
li.append(sz)
dex=2
li3=sorted(li, key=lambda x:x[dex])
li3.reverse()
m=0
nr=''
for p in li3:
m=m+1
nr=nr+'# 第'+str(m)+'名:\n'+p[0]+'\n作者:'+p[1]+'\n引用的词条数量:'+str(p[2])+'\n'
if m==64:
with open ('引用数量排名64.txt','w',encoding='utf-8') as e:
e.write(nr)
break
dex=3
li4=sorted(li, key=lambda x:x[dex])
li4.reverse()
m=0
nr=''
for p in li4:
m=m+1
nr=nr+'# 第'+str(m)+'名:\n'+p[0]+'\n作者:'+p[1]+'\n长度:'+str(p[3])+'\n'
if m==64:
with open ('长度排名64.txt','w',encoding='utf-8') as e:
e.write(nr)
break
for l4 in li4:
if l4[1]=='binshu2233':
print(l4[0],str(l4[3]))
dex=4
li5=sorted(li, key=lambda x:x[dex])
li5.reverse()
m=0
nr=''
for p in li5:
m=m+1
nr=nr+'# 第'+str(m)+'名:\n'+p[0]+'\n编辑过它的人数:'+str(p[4])+'\n'
if m==32:
with open ('编辑人数排名32.txt','w',encoding='utf-8') as e:
e.write(nr)
break
dex=5
li6=sorted(li, key=lambda x:x[dex])
li6.reverse()
m=0
nr=''
for p in li6:
m=m+1
nr=nr+'# 第'+str(m)+'名:\n'+p[0]+'\n被编辑的次数:'+str(p[5])+'\n'
if m==32:
with open ('被编辑的次数排名32.txt','w',encoding='utf-8') as e:
e.write(nr)
break
dex=6
li7=sorted(li, key=lambda x:x[dex])
li7.reverse()
m=0
nr=''
for p in li7:
m=m+1
nr=nr+'# 第'+str(m)+'名:\n'+p[0]+'\n被评论次数:'+str(p[6])+'\n'
if m==16:
with open ('被评论次数排名16.txt','w',encoding='utf-8') as e:
e.write(nr)
break
dex=7
li8=sorted(li, key=lambda x:x[dex])
li8.reverse()
m=0
nr=''
for p in li8:
m=m+1
nr=nr+'# 第'+str(m)+'名:\n'+p[0]+'\n评论它的人数:'+str(p[7])+'\n'
if m==16:
with open ('评论它的人数排名16.txt','w',encoding='utf-8') as e:
e.write(nr)
break
comm=sorted(comm, key=lambda x:x[1])
ak = r.get('http://wiki.jowei19.com/api/GetUserList')
aak=json.loads(ak.text)
for o in aak:
yuser=o["UserName"]
upl=''
n=0
for u in comm:
if u[1]==yuser:
n=1
upl=upl+u[0]+' '+u[2]+'\n'
yu2=yuser.replace('.','点')
if n==1:
with open ('comment/'+yu2+'.txt','w',encoding='utf-8') as w:
w.write(upl)
yyh=[]
for i in li:
byy=0
ide=i[9]
for j in li:
yy=j[8]
for k in yy:
if k == ide:
byy=byy+1
yyh.append([i[0],i[9],byy])
yli9=sorted(yyh, key=lambda x:x[2])
yli9.reverse()
m=0
nr=''
for p in yli9:
m=m+1
nr=nr+'# 第'+str(m)+'名:\n'+p[0]+'\n被引用的次数:'+str(p[2])+'\n'
if m==64:
with open ('被引用次数排名64.txt','w',encoding='utf-8') as e:
e.write(nr)
break
These days I would recommend storing all of the entry data in a single list and then generating one JSON file from it; that avoids the filename problems entirely.
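A minimal sketch of that approach, assuming the per-entry files from the crawler above are still on disk (the all_wiki.json name is just an example):

import json
import os

entries = []
for fname in os.listdir('wiki'):
    with open(os.path.join('wiki', fname), encoding='utf-8') as f:
        entries.append(json.load(f))

# One output file; titles never become filenames, so none of the
# character stripping in the first script is needed.
with open('all_wiki.json', 'w', encoding='utf-8') as f:
    json.dump(entries, f, ensure_ascii=False)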
The rough idea behind the folder word count: starting from a folder, recurse through all of its subfolders, add up the Length of every wiki entry filed anywhere underneath it, and count each entry only once even when it is filed in several folders.
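In outline, the recursion looks like this (a sketch only; dirs_by_id and length_by_wiki are assumed to be dicts built from the files that the two scripts below actually crawl and load):

def total_length(dir_id, dirs_by_id, length_by_wiki, seen=None):
    # dirs_by_id: dir id -> [id, name, creator, subdir ids, wiki ids] (assumed)
    # length_by_wiki: wiki id -> Length (assumed)
    if seen is None:
        seen = set()  # wiki ids already counted, shared across the recursion
    record = dirs_by_id[dir_id]
    total = 0
    for sub_id in record[3]:   # subfolders first
        total += total_length(sub_id, dirs_by_id, length_by_wiki, seen)
    for wiki_id in record[4]:  # then this folder's own entries, once each
        if wiki_id not in seen:
            seen.add(wiki_id)
            total += length_by_wiki.get(wiki_id, 0)
    return total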
"""
Created on Mon Jan 23 19:22:20 2023
@author: 26409
"""
import json
import requests as r
import os
root='wikidir'
if os.path.exists(root)==False:
os.makedirs(root)
ids=0
while 0>-1:
ids=ids+1
url='http://wiki.jowei19.com/api/GetWikiDirInfoById?dirId='+str(ids)
a = r.get(url)
aa=a.text
khjc = r.get('http://wiki.jowei19.com/api/GetWikiInfoById?wikiId='+str(ids+1))
kh2=khjc.text
print(len(aa))
if len(aa)>1234:
continue
if aa=='未找到素材:"errmsg"' and kh2==aa:
print(ids-1)
break
else:
print(ids)
if aa=='未找到素材:"errmsg"':
continue
js=json.loads(aa)
name=js.get('Name')
user=js.get('Creator')
sud=js.get('SubDirs')
con=js.get('ContainWikis')
e=[ids,name,user,sud,con]
with open('wikidir/'+str(ids)+'.json','w',encoding='utf-8') as xie:
st=str(e)
st=st.replace('\'','\"')
xie.write(st)
"""
Created on Mon Jan 23 20:00:54 2023
@author: 26409
"""
import json
import requests as r
import sys
import os
for dirpath , dirnames , filenames in os.walk('wikidir'):
wj=filenames
data=[]
dataw=[]
for i in wj:
with open('wikidir/'+i,'r',encoding='utf-8') as j:
js=json.load(j)
data.append(js)
for dirpath , dirnames , filenames in os.walk('wiki'):
wj2=filenames
for p in wj2:
with open('wiki/'+p,'r',encoding='utf-8') as ja:
jss=json.load(ja)
dataw.append(jss)
wiki=[]
for q in dataw:
idi=q['Id']
long=q['Length']
wiki.append([idi,long])#id,长
def dirs(wjj,chong,ci):#int wjj
length=0
for d in data:
if d[0]==wjj:
e=d
break
if e[3]==[]:
#没有子文件夹
length=length
chong=chong
else:
for f in e[3]:
length=length+(dirs(f,chong,ci))[0]
chong=chong+(dirs(f,chong,ci))[1]
chong2=[]
for ch in chong:
if ch not in chong2:
chong2.append(ch)
chong=chong2
ci=1+(dirs(f,chong,ci))[2]
#直接wiki
for l in e[4]:
for m in wiki:
if m[0]==l:
if l not in chong:
length=length+m[1]
chong.append(l)
return([length,chong,ci])
jg=[]
for h in data:
zs=(dirs(h[0],[],0))[0]
jg.append((h[0],zs))
jg2=(sorted(jg, key=lambda x:x[1]))
jg2.reverse()
io=int(input('文件夹id'))
for gj in jg2:
if gj[0]==io:
print(str(gj[1])+'字数')