プログラミング

Colaboratoryの出力をHTMLで出す

2023年11月29日 by 河副太智 Leave a Comment

Google Colaboratoryでは、Pythonのコードを実行した結果をHTMLで表示させることができます。そのためには、IPython.displayモジュールからHTMLクラスをインポートして使います。例えば、以下のようなコードを実行すると、HTMLで「こんにちは」と表示されます。

from IPython.display import HTML
HTML('<p>こんにちは</p>')

1 2	from IPython.display import HTML HTML('<p>こんにちは</p>')

また、別にhtmlファイルを用意してそれを呼び出すことも可能

<!DOCTYPE html>
<html lang="ja">
<head>
  <meta charset="UTF-8">
  <title>テスト</title>
</head>
<body>
  <h1>これはテストです</h1>
  <p>Google ColaboratoryでHTMLを表示させることができます</p>
</body>
</html>

<!DOCTYPE html>

<head>

</head>

<body>

<h1>これはテストです</h1>

<p>Google ColaboratoryでHTMLを表示させることができます</p>

</body>

</html>

上記の内容をtest.htmlというファイル名で保存し、以下のコードで読み込んで表示する。

from IPython.display import HTML
HTML(filename="test.html")

1 2	from IPython.display import HTML HTML(filename="test.html")

翻訳ツールGoogleトランスの代わり

2023年1月4日 by 河副太智 Leave a Comment

Googleトランスがなぜか使えないので代替ツールを使用

pip install deep-translator

1	pip install deep-translator

でdeeptranslatorをインストール

from deep_translator import GoogleTranslator
# Configure the translator: source='auto' (no fixed source language), target English.
translator = GoogleTranslator(source='auto', target='en')
translated = translator.translate("요소로")

# Show the translated text.
print(translated)

from deep_translator import GoogleTranslator

translated = GoogleTranslator(source='auto',target='en').translate("요소로")

print(translated)

>>as an element

PDFファイル操作

2022年11月26日 by 河副太智 Leave a Comment

PDFの文字列をテキストに

from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LAParams
from io import StringIO

# Extract the text of every page of a PDF into the string `text`.
#
# Open the PDF in binary mode with the built-in open(); the with-block
# guarantees the file is closed even if page processing raises.
with open("files/Training-Manual-for-Harmonized-Tariff-Schedule-Hts-Classification.pdf", 'rb') as fp:

    # In-memory text stream that receives the extracted text.
    outfp = StringIO()

    # Objects pdfminer.six needs for text extraction.
    rmgr = PDFResourceManager()                            # PDFResourceManager object
    lprms = LAParams()                                     # LAParams object
    device = TextConverter(rmgr, outfp, laparams=lprms)    # TextConverter writing into outfp
    iprtr = PDFPageInterpreter(rmgr, device)               # PDFPageInterpreter object

    try:
        # Analyse (extract text from) the PDF one page at a time.
        for page in PDFPage.get_pages(fp):
            iprtr.process_page(page)

        # Read the accumulated text before the stream is closed.
        text = outfp.getvalue()
    finally:
        # Release the converter before the stream it writes to.
        device.close()
        outfp.close()

print(text)  # Show the result in the Jupyter output box

from pdfminer.pdfinterp import PDFResourceManager

from pdfminer.converter import TextConverter

from pdfminer.pdfinterp import PDFPageInterpreter

from pdfminer.pdfpage import PDFPage

from pdfminer.layout import LAParams

from io import StringIO

# 標準組込み関数open()でモード指定をbinaryでFileオブジェクトを取得

fp = open("files/Training-Manual-for-Harmonized-Tariff-Schedule-Hts-Classification.pdf", 'rb')

# 出力先をPythonコンソールするためにIOストリームを取得

outfp = StringIO()

# 各種テキスト抽出に必要なPdfminer.sixのオブジェクトを取得する処理

rmgr = PDFResourceManager() # PDFResourceManagerオブジェクトの取得

lprms = LAParams() # LAParamsオブジェクトの取得

device = TextConverter(rmgr, outfp, laparams=lprms) # TextConverterオブジェクトの取得

iprtr = PDFPageInterpreter(rmgr, device) # PDFPageInterpreterオブジェクトの取得

# PDFファイルから1ページずつ解析(テキスト抽出)処理する

for page in PDFPage.get_pages(fp):

iprtr.process_page(page)

text = outfp.getvalue() # Pythonコンソールへの出力内容を取得

outfp.close() # I/Oストリームを閉じる

device.close() # TextConverterオブジェクトの解放

fp.close() # Fileストリームを閉じる

print(text) # Jupyterの出力ボックスに表示する

PDFの表をエクセルに変換

# Import Module 
import os
import tabula
import pandas as pd

# Convert every table tabula finds in a PDF into per-table Excel files,
# then merge those files into files/total.xlsx.
filename = "files/Seniority List 2018 19.pdf"

# Parse the PDF only once. read_pdf(..., pages='all') returns a list that the
# code indexes per entry, so `amount` is the number of entries in that list
# (extracted tables), which is not necessarily the number of pages.
tables = tabula.read_pdf(filename, pages='all')
amount = len(tables)

# Write each extracted table to its own workbook: files/excel0.xlsx, excel1.xlsx, ...
for i, df in enumerate(tables):
    df.to_excel('files/excel' + str(i) + '.xlsx')

# Collect the names of the generated files.
# NOTE(review): any older file in files/ whose name contains "excel" is also
# counted here, which would make the read-back loop below look for files
# that this run did not write - confirm the folder is clean before running.
filename = "excel"
filelist = os.listdir("files")
filelist = [f for f in filelist if filename in f]

# Load each workbook back as a pandas DataFrame (header=None keeps the
# header row as ordinary data).
each_file = []
for file in range(len(filelist)):
    each_file.append(pd.read_excel('files/excel' + str(file) + '.xlsx', header=None))

# Concatenate all tables and save the combined workbook.
df = pd.concat(each_file)
df.to_excel('files/total.xlsx', index=False)

# Import Module

import os

import tabula

import pandas as pd

filename= "files/Seniority List 2018 19.pdf"

#pdfのページ数を取得

amount = (len(tabula.read_pdf(filename, pages = 'all')))

#pdfのページ数だけまわす

for i in range(amount):

df = tabula.read_pdf(filename, pages = 'all')[i]

### Convert into Excel File

df.to_excel('files/excel' + str(i) + '.xlsx')

#アウトプットされたファイルの名前をリストに入れる

filename= "excel"

filelist = os.listdir("files")

filelist = [f for f in filelist if filename in f]

# エクセルを一つずつpandasデータとして取得

each_file = []

for file in range(len(filelist)):

#エクセルファイルの数だけforを回し、それぞれをリスト変数に入れていく

each_file.append(pd.read_excel('files/excel' + str(file) + '.xlsx', header=None))

#リストを結合

df = pd.concat(each_file)

df.to_excel('files/total.xlsx', index=False)

PDFの画像を抽出

# PRG1：ライブラリ設定
import fitz
import os
 
# PRG2: Choose the folder the extracted images are saved to
# (the PDF path up to its first '.', resolved against the current directory).
filename = 'files/dai1.pdf'
dir_name = filename.split('.')[0]
img_dir = os.path.join(os.getcwd(), dir_name)
# makedirs(exist_ok=True) is a no-op when the folder already exists and also
# creates missing parent folders.
os.makedirs(img_dir, exist_ok=True)

# PRG3: Open the PDF file
doc = fitz.open(filename)

# PRG4/PRG5: Collect the image information of every page, one list per page
images = [doc[page].get_images() for page in range(len(doc))]

# PRG6: Walk through the pages in order
for pageNo, image in enumerate(images):
    # PRG7: Process every image entry of this page (nothing happens for an empty page)
    for i, info in enumerate(image):
        # PRG8: Read the image information: xref, soft-mask xref and filter name
        xref = info[0]
        smask = info[1]
        # DCTDecode images are saved as JPEG. Everything else (FlateDecode and
        # any other filter) falls back to PNG so that `ext` is always defined
        # and never carried over from the previous image.
        if info[8] == 'DCTDecode':
            ext = 'jpeg'
        else:
            ext = 'png'

        # PRG9: Load the image and, if it has a soft mask, rebuild it with the mask
        pix = fitz.Pixmap(doc.extract_image(xref)["image"])
        if smask > 0:
            mask = fitz.Pixmap(doc.extract_image(smask)["image"])
            pix = fitz.Pixmap(pix, 0)
            pix = fitz.Pixmap(pix, mask)
            # NOTE(review): a masked pixmap saved with the 'jpeg' extension may
            # be rejected because JPEG has no alpha channel - confirm.

        # PRG10: Save the image as image<page>_<index>.<ext>
        img_name = os.path.join(img_dir, f'image{pageNo+1}_{i}.{ext}')
        pix.save(img_name)

# PRG1：ライブラリ設定

import fitz

import os

# PRG2：画像の保存先フォルダを設定

filename = 'files/dai1.pdf'

dir_name = filename.split('.')[0]

img_dir = os.path.join(os.getcwd(),dir_name)

if os.path.isdir(img_dir) == False:

os.mkdir(img_dir)

# PRG3：PDFファイルを読み込む

doc = fitz.open(filename)

# PRG4：画像情報を格納するリストを作成

images = []

# PRG5：１ページずつ画像データを取得

for page in range(len(doc)):

images.append(doc[page].get_images())

# PRG6：ページ内の画像情報を順番に処理

for pageNo, image in enumerate(images):

# PRG7：ページ内の画像情報を処理する

if image != []:

for i in range(len(image)):

# PRG8：画像情報の取得

xref = image[i][0]

smask = image[i][1]

if image[i][8] == 'FlateDecode':

ext = 'png'

elif image[i][8] == 'DCTDecode':

ext = 'jpeg'

# PRG9：マスク情報の取得と画像の再構築

pix = fitz.Pixmap(doc.extract_image(xref)["image"])

if smask > 0:

mask = fitz.Pixmap(doc.extract_image(smask)["image"])

pix = fitz.Pixmap(pix, 0)

pix = fitz.Pixmap(pix, mask)

# PRG10：画像を保存

img_name = os.path.join(img_dir, f'image{pageNo+1}_{i}.{ext}')

pix.save(img_name)

テーブルのみのdumpとrestore

2022年10月20日 by 河副太智 Leave a Comment

PostgreSQL 12.6 から PostgreSQL 14.5 にtest_eu3という名称のテーブルのみを移行する際にエラーが頻発したので顛末を記録していく。

テーブルのDumpは以下のコード

※事前に cd .. でCドライブの最上位まで移動してから cd \Program Files\PostgreSQL\12\bin でbinフォルダに移動しておく。以下はpsqlの中ではなく、通常のコマンドプロンプトで実行するコマンド。

pg_dump -Fc -b -U postgres -t test_eu -f C:\Users\...\Desktop\test_eu.sql rulings

1	pg_dump -Fc -b -U postgres -t test_eu -f C:\Users\...\Desktop\test_eu.sql rulings

まず、以下のエラーが出て文字化けしているので内容がつかめない。

C:\Users\...>pg_restore -v -U app_admin -d pre_rulings -t test_eu3 test_eu3.sql
pg_restore: connecting to database for restore
Password:
pg_restore: implied data-only restore
pg_restore: processing data for table "public.test_eu3"
pg_restore: while PROCESSING TOC:
pg_restore: from TOC entry 2915; 0 3097390 TABLE DATA test_eu3 postgres
pg_restore: error: could not execute query: ERROR:  ????????????"public.test_eu3"??????????s
Command was: COPY public.test_eu3 (id, "national", item_day, item_hs_all, item_hs2, item_hs4, item_hs6, item_place, image_amount, img_name_all, item_image, other_info, org_discription, org_discription2, eng_discription, eng_discription2, id2) FROM stdin;
pg_restore: warning: errors ignored on restore: 1

C:\Users\...>

C:\Users\...>pg_restore -v -U app_admin -d pre_rulings -t test_eu3 test_eu3.sql

pg_restore: connecting to database for restore

Password:

pg_restore: implied data-only restore

pg_restore: processing data for table "public.test_eu3"

pg_restore: while PROCESSING TOC:

pg_restore: from TOC entry 2915; 0 3097390 TABLE DATA test_eu3 postgres

pg_restore: error: could not execute query: ERROR: ????????????"public.test_eu3"??????????s

Command was: COPY public.test_eu3 (id, "national", item_day, item_hs_all, item_hs2, item_hs4, item_hs6, item_place, image_amount, img_name_all, item_image, other_info, org_discription, org_discription2, eng_discription, eng_discription2, id2) FROM stdin;

pg_restore: warning: errors ignored on restore: 1

C:\Users\...>

そこでpsqlで以下を入力し、英語に変換する

ALTER ROLE app_admin SET lc_messages = 'C';

1	ALTER ROLE app_admin SET lc_messages = 'C';

すると以下のようなエラーに変わった

C:\Users\enosh>pg_restore -v -U app_admin -d pre_rulings -t test_eu3 test_eu3.sql
pg_restore: connecting to database for restore
Password:
pg_restore: implied data-only restore
pg_restore: processing data for table "public.test_eu3"
pg_restore: while PROCESSING TOC:
pg_restore: from TOC entry 2915; 0 3097390 TABLE DATA test_eu3 postgres
pg_restore: error: could not execute query: ERROR:  relation "public.test_eu3" does not exist
Command was: COPY public.test_eu3 (id, "national", item_day, item_hs_all, item_hs2, item_hs4, item_hs6, item_place, image_amount, img_name_all, item_image, other_info, org_discription, org_discription2, eng_discription, eng_discription2, id2) FROM stdin;
pg_restore: warning: errors ignored on restore: 1

C:\Users\enosh>pg_restore -v -U app_admin -d pre_rulings -t test_eu3 test_eu3.sql

pg_restore: connecting to database for restore

Password:

pg_restore: implied data-only restore

pg_restore: processing data for table "public.test_eu3"

pg_restore: while PROCESSING TOC:

pg_restore: from TOC entry 2915; 0 3097390 TABLE DATA test_eu3 postgres

pg_restore: error: could not execute query: ERROR: relation "public.test_eu3" does not exist

pg_restore: warning: errors ignored on restore: 1

relation "public.test_eu3" が存在しないとの事。
publicとはデータベース作成の際に自動で作成されるスキーマ名の事。
スキーマについては特に気にせず、移行先に空のtest_eu3のテーブルを作成し、以下を入力

pg_restore -v -U app_admin -d pre_rulings -t test_eu test_eu.sql

1	pg_restore -v -U app_admin -d pre_rulings -t test_eu test_eu.sql

空のテーブルとdumpにあるテーブルデータのカラムが異なるとエラーになるので注意する。

これで移行が完了。

画像のスクレイピング(sslの場合)

2022年9月3日 by 河副太智 Leave a Comment

画像のスクレイピング(sslの場合)

import io
import ssl
from urllib import request

# Build a client TLS context with the library defaults (certificate and
# hostname verification enabled) instead of the deprecated
# ssl.PROTOCOL_TLSv1_2 constant, and refuse anything older than TLS 1.2.
context = ssl.create_default_context()
context.minimum_version = ssl.TLSVersion.TLSv1_2

item_image = "https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Cat03.jpg/180px-Cat03.jpg"

# Download the image into an in-memory buffer; the with-block closes the
# HTTP response once the bytes have been read.
with request.urlopen(item_image, context=context) as response:
    f = io.BytesIO(response.read())

import io

import ssl

from urllib import request

context = ssl.SSLContext(ssl.PROTOCOL_TLSv1_2)

item_image = "https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Cat03.jpg/180px-Cat03.jpg"

f = io.BytesIO(request.urlopen(item_image,context=context).read())

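なお、ssl.SSLContext(ssl.PROTOCOL_TLSv1_2) でコンテキストを作成すると以下の警告が表示される。これはエラーではなくDeprecationWarningで、ssl.PROTOCOL_TLSv1_2 が非推奨になったことが原因。ssl.create_default_context() でコンテキストを作成すれば警告は出なくなる。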

C:\Users\xxx\AppData\Local\Temp\ipykernel_16388\4181980830.py:5: DeprecationWarning: ssl.PROTOCOL_TLSv1_2 is deprecated
context = ssl.SSLContext(ssl.PROTOCOL_TLSv1_2)

応用例

# Create the output folder, named <country> + <6-digit HS code>, if it does not exist yet.
check_folder_exsist = glob.glob("E:\\rulings_img/" + HS_jsonData)
if not check_folder_exsist:
    os.mkdir("E:\\rulings_img/" + HS_jsonData)


# Download and save each of the item's images, collecting the image URLs and
# saved file names as comma-separated strings.
for g in range(image_amount):
    # XPath tail shared by both candidate locations of the (g+1)-th image.
    xpath_tail = "]/table/tbody/tr[9]/td/span[" + str(g+1) + "]/a/img"
    try:
        img_element = driver.find_element(By.XPATH, "//*[@id=\"dtlLayer\"]/div[4" + xpath_tail)
        time.sleep(1)
    except Exception as e:
        # The image was not found under div[4]; look under div[5] instead.
        print("画像取得エラー発生divの値を5に変更:", e.args)
        img_element = driver.find_element(By.XPATH, "//*[@id=\"dtlLayer\"]/div[5" + xpath_tail)

    # Use whichever element was actually found, so the div[5] fallback takes effect.
    item_image = img_element.get_attribute("src")


    # Fetch the image bytes; on failure wait 10 seconds and try once more.
    try:
        f = io.BytesIO(request.urlopen(item_image,context=context).read())
    except Exception:
        print(">>>>>>>画像取得リトライ<<<<<<<<")
        time.sleep(10)
        f = io.BytesIO(request.urlopen(item_image,context=context).read())

    # Try up to three times; if the image still cannot be processed, record "None".
    for image_loop in range(3):
        try:
            img = Image.open(f)

            img_name = "E:\\rulings_img/" + HS_jsonData + "/" + link_id + "-" + str(g+1) + ".jpg"

            # If saving as-is fails, convert the image to RGB and save again.
            try:
                img.save(img_name)
            except Exception:
                print("imgで保存できない為rgbに変換")
                rgb_img = img.convert('RGB')
                rgb_img.save(img_name)


            # From the second image on, append the URL and file name comma-separated.
            if g > 0:
                item_image_urls = item_image_urls + "," + item_image
                img_name_all = img_name_all + ","+ img_name
            else:
                item_image_urls = item_image
                img_name_all = img_name
            break
        except Exception:
            print("3秒停止")
            time.sleep(3)
    else:
        # All three attempts failed: discard the results and stop processing images.
        item_image_urls = "None"
        img_name_all = "None"
        image_amount = 0
        break
else:
item_image_urls = "None"
img_name_all = "None"   
image_amount = 0

#国名+hs6桁のフォルダ名を設定

check_folder_exsist = glob.glob("E:\\rulings_img/" + HS_jsonData)

if not check_folder_exsist:

os.mkdir("E:\\rulings_img/" + HS_jsonData)

for g in range(image_amount):

try:

driver.find_element(By.XPATH, "//*[@id=\"dtlLayer\"]/div[4]/table/tbody/tr[9]/td/span[" + str(g+1)+ "]/a/img")

time.sleep(1)

except Exception as e:

print("画像取得エラー発生divの値を5に変更:", e.args)

driver.driver.find_element(By.XPATH, "//*[@id=\"dtlLayer\"]/div[5]/table/tbody/tr[9]/td/span[" + str(g+1)+ "]/a/img")

item_image = driver.find_element(By.XPATH, "//*[@id=\"dtlLayer\"]/div[4]/table/tbody/tr[9]/td/span[" + str(g+1)+ "]/a/img").get_attribute("src")

#画像の取得

try:

f = io.BytesIO(request.urlopen(item_image,context=context).read())

except:

print(">>>>>>>画像取得リトライ<<<<<<<<")

time.sleep(10)

f = io.BytesIO(request.urlopen(item_image,context=context).read())

#３回トライしても画像取得できない場合はNONEにする

for image_loop in range(3):

try:

img = Image.open(f)

img_name = "E:\\rulings_img/" + HS_jsonData + "/" + link_id + "-" + str(g+1) + ".jpg"

#画像保存時にRGBはjpgに変換できないというエラーを防ぐ

try:

img.save(img_name)

except:

print("imgで保存できない為rgbに変換")

rgb_img = img.convert('RGB')

rgb_img.save(img_name)

#画像が2件以上あれば画像名と画像URLをカンマ区切りで取得

if g > 0:

item_image_urls = item_image_urls + "," + item_image

img_name_all = img_name_all + ","+ img_name

else:

item_image_urls = item_image

img_name_all = img_name

break

except:

print("3秒停止")

time.sleep(3)

else:

item_image_urls = "None"

img_name_all = "None"

image_amount = 0

break

else:

item_image_urls = "None"

img_name_all = "None"

image_amount = 0