As you can see at the end of my benchmark post, there are three acceptable ways (performance-wise) to do a bulk insert in Psycopg2.
This post provides end-to-end working code for the copy_from() option. There are two ways to do it:
- save your dataframe to disk as a csv file and load that file into your SQL table, or
- save your dataframe as an in-memory StringIO object and load it directly into your SQL table.
Here is how to do both.
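Both options below assume that df is an existing pandas DataFrame and that a matching table already exists in the database. As a minimal, hypothetical setup sketch (the column names and types here are my own assumptions, not taken from the benchmark post), it could look like this:

import pandas as pd

# Hypothetical example data; adapt the columns to your own dataset
df = pd.DataFrame({
    "year":        [2019, 2019, 2019],
    "month":       [1, 2, 3],
    "temperature": [2.3, 4.1, 7.8],
})

# The target table must already exist and match the csv layout,
# including the leading 'id' column produced by index_label='id':
#
#   CREATE TABLE MonthlyTemp (
#       id          integer,
#       year        integer,
#       month       integer,
#       temperature numeric
#   );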
Option 1: saving the dataframe to disk first
import psycopg2
import os
import sys

# Connection parameters
param_dic = {
    "host"     : "localhost",
    "database" : "globaldata",
    "user"     : "myuser",
    "password" : "Passw0rd"
}

def connect(params_dic):
    """ Connect to the PostgreSQL database server """
    conn = None
    try:
        # Connect to the PostgreSQL server
        print('Connecting to the PostgreSQL database...')
        conn = psycopg2.connect(**params_dic)
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        sys.exit(1)
    print("Connection successful")
    return conn
def copy_from_file(conn, df, table):
    """
    Here we are going to save the dataframe on disk as
    a csv file, open the csv file,
    and use copy_from() to copy it to the table
    """
    # Save the dataframe to disk
    tmp_df = "./tmp_dataframe.csv"
    df.to_csv(tmp_df, index_label='id', header=False)
    f = open(tmp_df, 'r')
    cursor = conn.cursor()
    try:
        cursor.copy_from(f, table, sep=",")
        conn.commit()
    except (Exception, psycopg2.DatabaseError) as error:
        print("Error: %s" % error)
        conn.rollback()
        cursor.close()
        f.close()          # close the file handle before removing the file
        os.remove(tmp_df)
        return 1
    print("copy_from_file() done")
    cursor.close()
    f.close()
    os.remove(tmp_df)
#-----------------------------------------------
# Main code
#-----------------------------------------------
conn = connect(param_dic) # connect to the database
copy_from_file(conn, df, 'MonthlyTemp') # copy the dataframe to SQL
conn.close() # close the connection
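As a quick sanity check (my addition, not part of the original post), you can count the rows right after the copy, before calling conn.close():

cursor = conn.cursor()
cursor.execute("SELECT COUNT(*) FROM MonthlyTemp")
print("Rows in table: %s" % cursor.fetchone()[0])
cursor.close()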
Option 2: saving the dataframe to memory using StringIO
import psycopg2
import sys
from io import StringIO

# Connection parameters
param_dic = {
    "host"     : "localhost",
    "database" : "globaldata",
    "user"     : "myuser",
    "password" : "Passw0rd"
}

def connect(params_dic):
    """ Connect to the PostgreSQL database server """
    conn = None
    try:
        # Connect to the PostgreSQL server
        print('Connecting to the PostgreSQL database...')
        conn = psycopg2.connect(**params_dic)
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        sys.exit(1)
    print("Connection successful")
    return conn
def copy_from_stringio(conn, df, table):
    """
    Here we are going to save the dataframe in memory
    and use copy_from() to copy it to the table
    """
    # Save the dataframe to an in-memory buffer
    buffer = StringIO()
    df.to_csv(buffer, index_label='id', header=False)
    buffer.seek(0)
    cursor = conn.cursor()
    try:
        cursor.copy_from(buffer, table, sep=",")
        conn.commit()
    except (Exception, psycopg2.DatabaseError) as error:
        print("Error: %s" % error)
        conn.rollback()
        cursor.close()
        return 1
    print("copy_from_stringio() done")
    cursor.close()
#-----------------------------------------------
# Main code
#-----------------------------------------------
conn = connect(param_dic) # connect to the database
copy_from_stringio(conn, df, 'MonthlyTemp') # copy the dataframe to SQL
conn.close() # close the connection
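One design note from my side, rather than from the benchmark: the StringIO buffer holds the entire csv rendering of the dataframe in memory, so for dataframes that are large relative to the available RAM, the temporary file of Option 1 is the safer route.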
Conclusion
As you can see in the benchmark, both flavors of copy_from() outperform every other bulk insert method tested in terms of speed. I would prefer to use the StringIO option, since it does not require you to create (and then delete) a temporary file on disk.
For a fully functioning tutorial on how to replicate this, please refer to my Jupyter notebook on GitHub.