As you can see at the end of my benchmark post, there are three acceptable ways (performance-wise) to do a bulk insert in Psycopg2.
This post provides end-to-end working code for the copy_from() option. There are two ways to do it:
- save your dataframe to disk as a csv file and load that file into your SQL table, or
- save your dataframe as an in-memory StringIO object and load it directly into your SQL table.
Here is how to do both.
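Both options below assume that df is an existing pandas DataFrame and that a matching table already exists in the database. As a minimal, hypothetical setup sketch (the column names and types here are my own assumptions, not taken from the benchmark post), it could look like this:

import pandas as pd

# Hypothetical example data; adapt the columns to your own dataset
df = pd.DataFrame({
    "year":        [2019, 2019, 2019],
    "month":       [1, 2, 3],
    "temperature": [2.3, 4.1, 7.8],
})

# The target table must already exist and match the csv layout,
# including the leading 'id' column produced by index_label='id':
#
#   CREATE TABLE MonthlyTemp (
#       id          integer,
#       year        integer,
#       month       integer,
#       temperature numeric
#   );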
Option 1: saving the dataframe to disk first
import psycopg2
import os
import sys

# Connection parameters
param_dic = {
    "host"     : "localhost",
    "database" : "globaldata",
    "user"     : "myuser",
    "password" : "Passw0rd"
}

def connect(params_dic):
    """ Connect to the PostgreSQL database server """
    conn = None
    try:
        # Connect to the PostgreSQL server
        print('Connecting to the PostgreSQL database...')
        conn = psycopg2.connect(**params_dic)
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        sys.exit(1)
    print("Connection successful")
    return conn
def copy_from_file(conn, df, table):
    """
    Here we are going to save the dataframe on disk as
    a csv file, open the csv file,
    and use copy_from() to copy it to the table
    """
    # Save the dataframe to disk
    tmp_df = "./tmp_dataframe.csv"
    df.to_csv(tmp_df, index_label='id', header=False)
    f = open(tmp_df, 'r')
    cursor = conn.cursor()
    try:
        cursor.copy_from(f, table, sep=",")
        conn.commit()
    except (Exception, psycopg2.DatabaseError) as error:
        print("Error: %s" % error)
        conn.rollback()
        cursor.close()
        f.close()          # close the file handle before removing the file
        os.remove(tmp_df)
        return 1
    print("copy_from_file() done")
    cursor.close()
    f.close()
    os.remove(tmp_df)
#-----------------------------------------------
# Main code
#-----------------------------------------------
conn = connect(param_dic) # connect to the database
copy_from_file(conn, df, 'MonthlyTemp') # copy the dataframe to SQL
conn.close() # close the connection
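As a quick sanity check (my addition, not part of the original post), you can count the rows right after the copy, before calling conn.close():

cursor = conn.cursor()
cursor.execute("SELECT COUNT(*) FROM MonthlyTemp")
print("Rows in table: %s" % cursor.fetchone()[0])
cursor.close()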
Option 2: saving the dataframe to memory using StringIO
import psycopg2
import sys
from io import StringIO

# Connection parameters
param_dic = {
    "host"     : "localhost",
    "database" : "globaldata",
    "user"     : "myuser",
    "password" : "Passw0rd"
}

def connect(params_dic):
    """ Connect to the PostgreSQL database server """
    conn = None
    try:
        # Connect to the PostgreSQL server
        print('Connecting to the PostgreSQL database...')
        conn = psycopg2.connect(**params_dic)
    except (Exception, psycopg2.DatabaseError) as error:
        print(error)
        sys.exit(1)
    print("Connection successful")
    return conn
def copy_from_stringio(conn, df, table):
    """
    Here we are going to save the dataframe in memory
    and use copy_from() to copy it to the table
    """
    # Save the dataframe to an in-memory buffer
    buffer = StringIO()
    df.to_csv(buffer, index_label='id', header=False)
    buffer.seek(0)
    cursor = conn.cursor()
    try:
        cursor.copy_from(buffer, table, sep=",")
        conn.commit()
    except (Exception, psycopg2.DatabaseError) as error:
        print("Error: %s" % error)
        conn.rollback()
        cursor.close()
        return 1
    print("copy_from_stringio() done")
    cursor.close()
#-----------------------------------------------
# Main code
#-----------------------------------------------
conn = connect(param_dic) # connect to the database
copy_from_stringio(conn, df, 'MonthlyTemp') # copy the dataframe to SQL
conn.close() # close the connection
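One design note from my side, rather than from the benchmark: the StringIO buffer holds the entire csv rendering of the dataframe in memory, so for dataframes that are large relative to the available RAM, the temporary file of Option 1 is the safer route.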
Conclusion
As you can see in the benchmark, both flavors of copy_from() outperform every other bulk insert method tested in terms of speed. I would prefer to use the StringIO option, since it does not require you to create (and then delete) a temporary file on disk.
For a fully functioning tutorial on how to replicate this, please refer to my Jupyter notebook on GitHub.