ここでは、Pythonを使用してローカルに保存されたHTMLファイルから聖書の詩を抽出しています。実際、このPythonは私が書いたものではなく、どこかから来たものです。聖書の詩を抽出するためにPythonコードを実行すると、次のような結果が表示されます。u'Joseph asopo atum aphan Jakob thek-longsi, alangli arjulo, \u201cLahei komat asomarlo?\u201d'
このタイプの行が多いと、ファイルに書き込むときにUnicode文字を正しくエンコードしません。端末で実行すると、Unicode文字が正しく表示されエンコードされます。誰もが問題を解決できる場合は、ここにPythonコードがあり、ここにローカルに保存されたHTMLがあります.zip形式のhtmlファイルのダウンロード これはPythonファイル。このPythonコードはフォルダの下のHTMLファイルを読み込み、example
フォルダ内に出力ファイルを生成します。example/exampleoutputs
import os, sys
import numpy as np
import array as arr
import re
from lxml import etree
from lxml import html
import urllib2
from bs4 import BeautifulSoup
import csv
import codecs
import shutil
class BIBLE_CLASS:
def __init__(self,path):
self.bookslist=[]
def readBookList(self,path): # Read the data from file
file=open(path,'r')
for x in file:
# print(x)
self.bookslist.append(x)
#print(Bookslist)
#for i in range(1,len(self.bookslist)):
# print(self.bookslist[i])
def searchBook(self,book): # Search the book ID in the books file
#bookcode=0
for i in range(1,len(self.bookslist)):
st=str(self.bookslist[i])
#st=st.upper()
#skey=book.upper()
#print(st,'',skey)
if(st.find(book)!=-1):
temp=self.bookslist[i]
index1=temp.index(',')
#print(temp[0:index1])
bookcode=temp[0:index1]
#print(bookcode)
return bookcode
def writeStoryFile(self,path):
# Writes data into story file
filext1=['.html']
print("writeStoryFile")
dirs=os.listdir(path)
pattern=re.compile("^v")
# CREATE A FOLDER WITH PATH+OUTPUTS AS NAME
outputfile=path+"outputs"
outpath=os.path.join(path,outputfile)
if os.path.exists(outpath):
shutil.rmtree(outpath)
else:
os.mkdir(outpath)
for d in dirs: #For each directory extract stories from files
print("Directory name:",d)
btitle=d[d.rindex('_')+1:]
print(btitle)
print('Book code:',o1.searchBook(btitle)) # Book code is extracted
fullpath=os.path.join(path,d)
if os.path.isdir(fullpath):
print("Converting folder",fullpath)
files=os.listdir(fullpath)
rno=0
# GET THE BOOK CODE FROM THE FILE
btitle=d[d.rindex('_')+1:]
#print(btitle)
#print('Book code:',o1.searchBook(btitle))
bcode=o1.searchBook(btitle)
print(bcode)
# CREATE STORY LINE FILE FOR EACH FOLDER
csvfile1=outpath+"/"+d+"story.csv" # story line filename
f1 = codecs.open(csvfile1, encoding='utf-8',mode='w') # Creating story lines file
csvfile2=outpath+"/"+d+"storyverses.csv" # story line with verses files
csvfile3=outpath+"/"+d+"veses.csv" # only verses
f2 = codecs.open(csvfile2, encoding='utf-8',mode='w')
f3=codecs.open(csvfile3, encoding='utf-8',mode='w')
for f in files:
fname,fext=os.path.splitext(f)
rows=[]
if fext in filext1:
print("=================Processing the file",f,"========================")
# Process the file
print "Stories in the chapter:"
# fullpath=os.path.join(path,file)
wpath=os.path.join(fullpath,f)
text=open(wpath,"r")
# csvfile1=wpath.replace('.html','story.csv') # story line filename
html_doc=text.read()
soup = BeautifulSoup(html_doc, 'html.parser',from_encoding="utf-8")
h3s = soup.find_all('h3')
storyverses=[]
storytitles=[]
for h3 in h3s:
next_element = h3.find_next()
print next_element.text
if next_element.find('sup') is None:
print("Element is null")
required_element=next_element.find_next()
if required_element is None:
required_element=next_element.next_element.find_next()
if required_element.find('sup') is None:
break
required_element= required_element.find_next('sup')
superscript_number = str(required_element.find_next('sup').text)
# print bcode,fname,superscript_number,h3.text
print "========",superscript_number
if len(superscript_number)>2:
storyverses.append(int(superscript_number[0:1]))
sn=superscript_number[0:1]
print sn
st=int(bcode),int(fname),int(superscript_number[0:1]), str(h3.text)
else:
storyverses.append(int(superscript_number))
st=int(bcode),int(fname),int(superscript_number), (h3.text).encode('utf-8')
storytitles.append(h3.text)
print st
rows.append(st)
rno=rno+1
else:
superscript_number = str(next_element.find('sup').text)
if len(superscript_number)>2:
storyverses.append(int(superscript_number[0:1]))
sn=superscript_number[0:1]
print sn
st=int(bcode),int(fname),int(superscript_number[0:1]), str(h3.text)
else:
storyverses.append(int(superscript_number))
st=int(bcode),int(fname),int(superscript_number), str(h3.text)
storytitles.append(h3.text)
print st
rows.append(st)
rno=rno+1
#print storytitles[0]
# Write story lines into file
# f1 = codecs.open(csvfile1, encoding='utf-8',mode='w') # Creating story lines file
for row in rows:
f1.write(str(row))
f1.write("\n")
# csvfile2=wpath.replace('.html','verse.csv') # story line files
# csvfile3=wpath.replace('.html','veses.csv')
# f2=open(csvfile2,"w")
verses=[]
verseno=1
# f2 = codecs.open(csvfile2, encoding='utf-8',mode='w')
# f3=codecs.open(csvfile3, encoding='utf-8',mode='w')
#f2=codecs.open(csvfile2,mode='w')
#f3=codecs.open(csvfile3,mode='w')
print "======================================================================================"
k=0
for y in soup.findAll('span',class_=[pattern,"heading"]):
sups=y('sup')
for z in sups:
z.decompose()
if verseno in storyverses:
st1=str(rows[storyverses.index(verseno)])
k=k+1
f2.write(st1)
f2.write("\n")
verses.append(st1)
print bcode,fname, verseno,'"'+y.text+'"'
st=int(bcode),int(fname), verseno,y.text
f2.write(str(st))
f2.write("\n")
f3.write(str(st))
f3.write("\n")
verses.append(st)
# print st
else:
print bcode,fname, verseno,'"'+y.text+'"'
st=int(bcode),int(fname),verseno,y.text
f2.write(str(st))
f2.write("\n")
f3.write(str(st))
f3.write("\n")
verses.append(st)
verseno+=1
# f1.close()
# f2.close()
# f3.close()
def writeVerses(self):
print("writeVerses")
# Writes verses into verse file
def writebothVerse(self):
print("writeStoryFile")
# Writes verse with story in between
# Main program
bookspath="BookCode/books.csv"
path='example' # Change this to the actual path of the input data
o1=BIBLE_CLASS(path)
# Step 1: Read the book list from the file
o1.readBookList(bookspath)
# Step 2: Create story files
o1.writeStoryFile(path)
filext1=['.html']
dirs=os.listdir(path)