Loading data from a data file into PostgreSQL is easy with the command "copy".
Steps involved in loading data:
1] Create a table with the columns in the data file
2] Use COPY to load data.
That is simple. But the challenge arises when the data file has hundreds of columns and you want the table to be created automatically with those many columns presented in the data file.
I found this little function and modified a bit to accommodate any kind of delimiter.
Here is the simple trick to find out the number of columns in the data file (unix):
$> head -1 data_file.txt |grep -o "|" |wc -l
grep -o parameter value is the delimiter used in the data file. In the example above it is pipe (|) delimited.
Here is how to use the function in PSQL:
select load_data_file('actual_table_name', '/data1/schanna/analytics/data_file_456.txt', 188, '|');
Here is the function Code:
CREATE OR REPLACE FUNCTION schanna.load_data_file(target_table text, file_path text, col_count integer, delimiter_char text)
RETURNS void
LANGUAGE plpgsql
AS $function$
declare
--Parameters:
--target_table - Name of the table to be
--file_path - Path of the data file to be loaded
--col_count - Number of columns in the data file
--delimiter_char - Delimiter Character used in the data file
iter integer; -- dummy integer to iterate columns with
col text; -- variable to keep the column name at each iteration
col_first text; -- first column name, e.g., top left corner on a data file
begin
--Set the working schema
set schema 'schanna';
-- Create a temporary table to load the data. This will be renamed at the end with the target_table parameter value
create table temp_table ();
-- Loop thru the number of columns in the data file and add columns to the temporary table
for iter in 1..col_count
loop
execute format('alter table temp_table add column col_%s text;', iter);
end loop;
-- Load data from data file
if delimiter_char = ',' then
execute format('copy temp_table from %L with delimiter '','' quote ''"'' csv ', file_path);
elsif delimiter_char = '|' then
execute format('copy temp_table from %L with delimiter ''|'' ', file_path);
end if;
iter := 1;
col_first := (select col_1 from temp_table limit 1);
-- update the column names based on the first row which has the column names
for col in execute format('select unnest(string_to_array(trim(temp_table::text, ''()''), '','')) from temp_table where col_1 = %L', col_first)
loop
execute format('alter table temp_table rename column col_%s to %s', iter, col);
iter := iter + 1;
end loop;
-- delete the First row which contains the column header
execute format('delete from temp_table where %s = %L', col_first, col_first);
-- rename temp table to the name given as parameter, if not blank
if length(target_table) > 0 then
execute format('alter table temp_table rename to %I', target_table);
end if;
end;
$function$
Steps involved in loading data:
1] Create a table with the columns in the data file
2] Use COPY to load data.
That is simple. But the challenge arises when the data file has hundreds of columns and you want the table to be created automatically with those many columns presented in the data file.
I found this little function and modified a bit to accommodate any kind of delimiter.
Here is the simple trick to find out the number of columns in the data file (unix):
$> head -1 data_file.txt |grep -o "|" |wc -l
grep -o parameter value is the delimiter used in the data file. In the example above it is pipe (|) delimited.
Here is how to use the function in PSQL:
select load_data_file('actual_table_name', '/data1/schanna/analytics/data_file_456.txt', 188, '|');
CREATE OR REPLACE FUNCTION schanna.load_data_file(target_table text, file_path text, col_count integer, delimiter_char text)
RETURNS void
LANGUAGE plpgsql
AS $function$
declare
--Parameters:
--target_table - Name of the table to be
--file_path - Path of the data file to be loaded
--col_count - Number of columns in the data file
--delimiter_char - Delimiter Character used in the data file
iter integer; -- dummy integer to iterate columns with
col text; -- variable to keep the column name at each iteration
col_first text; -- first column name, e.g., top left corner on a data file
begin
--Set the working schema
set schema 'schanna';
-- Create a temporary table to load the data. This will be renamed at the end with the target_table parameter value
create table temp_table ();
-- Loop thru the number of columns in the data file and add columns to the temporary table
for iter in 1..col_count
loop
execute format('alter table temp_table add column col_%s text;', iter);
end loop;
-- Load data from data file
if delimiter_char = ',' then
execute format('copy temp_table from %L with delimiter '','' quote ''"'' csv ', file_path);
elsif delimiter_char = '|' then
execute format('copy temp_table from %L with delimiter ''|'' ', file_path);
end if;
iter := 1;
col_first := (select col_1 from temp_table limit 1);
-- update the column names based on the first row which has the column names
for col in execute format('select unnest(string_to_array(trim(temp_table::text, ''()''), '','')) from temp_table where col_1 = %L', col_first)
loop
execute format('alter table temp_table rename column col_%s to %s', iter, col);
iter := iter + 1;
end loop;
-- delete the First row which contains the column header
execute format('delete from temp_table where %s = %L', col_first, col_first);
-- rename temp table to the name given as parameter, if not blank
if length(target_table) > 0 then
execute format('alter table temp_table rename to %I', target_table);
end if;
end;
$function$
No comments:
Post a Comment