Overview
In this blog, we dive into a Python function designed to streamline the process of filtering CSV files in a directory based on their column headers. This function distinguishes between files with an exact match of target columns and those that don’t, facilitating targeted file handling in data processing tasks. The approach utilizes standard libraries, demonstrating efficient coding practices while ensuring readability and maintainability.
Explanation with Inline Comments
import os # Required for directory operations
import csv # Required for handling CSV files
output_directory = 'extract'
def filter_csv_files(directory_path, target_columns):
exact_match_files = [] # Holds filenames with exactly matching target columns
other_files = [] # Holds filenames without an exact match of target columns
target_columns_set = set(target_columns) # Convert list to set for O(1) lookup
for filename in os.listdir(directory_path):
if filename.endswith('.csv'): # Filter for CSV files
file_path = os.path.join(directory_path, filename) # Full path to the file
try:
with open(file_path, mode='r', encoding='utf-8') as file:
reader = csv.reader(file)
columns = next(reader, None) # Read the header row
if columns:
columns_set = set(columns)
# Append file_path directly instead of concatenating strings
if columns_set == target_columns_set:
exact_match_files.append(file_path)
else:
other_files.append(file_path)
except Exception as e:
print(f"Error reading {filename}: {e}")
return exact_match_files, other_files
# Updated target columns to include additional identifiers
target_columns = ['Column1', 'Column2', 'Column3', 'Column4', 'Column5',
'Column6']
exact_match_files, other_files = filter_csv_files(output_directory, target_columns)
# Directly print the lists; no need for intermediate arrays
print(exact_match_files)
print('-------------- below is other_files_arr')
print(other_files)
Filtering CSV Files
import os
import csv
output_directory = 'extract'
target_columns = ['Column1', 'Column2', 'Column3', 'Column4', 'Column5', 'Column6']
Iterate with all files
Next, we define a function filter_csv_files
that takes the directory path and target columns as inputs and returns two lists: files with an exact match of target columns and those without.
def filter_csv_files(directory_path, target_columns):
exact_match_files = []
other_files = []
target_columns_set = set(target_columns)
for filename in os.listdir(directory_path):
if filename.endswith('.csv'):
file_path = os.path.join(directory_path, filename)
try:
with open(file_path, mode='r', encoding='utf-8') as file:
reader = csv.reader(file)
columns = next(reader, None)
if columns:
columns_set = set(columns)
if columns_set == target_columns_set:
exact_match_files.append(file_path)
else:
other_files.append(file_path)
except Exception as e:
print(f"Error reading {filename}: {e}")
return exact_match_files, other_files
Calling the function
Finally, we call the filter_csv_files
function with the output directory and target columns, and print the resulting lists of files.
exact_match_files, other_files = filter_csv_files(output_directory, target_columns)
print("Files with Exact Match of Target Columns:")
print(exact_match_files)
print("\nOther Files:")
print(other_files)