Python: Simplifying CSV File Filtering

Overview

In this blog, we dive into a Python function designed to streamline the process of filtering CSV files in a directory based on their column headers. This function distinguishes between files with an exact match of target columns and those that don’t, facilitating targeted file handling in data processing tasks. The approach utilizes standard libraries, demonstrating efficient coding practices while ensuring readability and maintainability.

Explanation with Inline Comments

import os  # Required for directory operations
import csv  # Required for handling CSV files

output_directory = 'extract'

def filter_csv_files(directory_path, target_columns):
    exact_match_files = []  # Holds filenames with exactly matching target columns
    other_files = []  # Holds filenames without an exact match of target columns

    target_columns_set = set(target_columns)  # Convert list to set for O(1) lookup

    for filename in os.listdir(directory_path):
        if filename.endswith('.csv'):  # Filter for CSV files
            file_path = os.path.join(directory_path, filename)  # Full path to the file
            try:
                with open(file_path, mode='r', encoding='utf-8') as file:
                    reader = csv.reader(file)
                    columns = next(reader, None)  # Read the header row
                    if columns:
                        columns_set = set(columns)
                        # Append file_path directly instead of concatenating strings
                        if columns_set == target_columns_set:
                            exact_match_files.append(file_path)
                        else:
                            other_files.append(file_path)
            except Exception as e:
                print(f"Error reading {filename}: {e}")

    return exact_match_files, other_files

# Updated target columns to include additional identifiers
target_columns = ['Column1', 'Column2', 'Column3', 'Column4', 'Column5', 
                  'Column6']

exact_match_files, other_files = filter_csv_files(output_directory, target_columns)

# Directly print the lists; no need for intermediate arrays
print(exact_match_files)
print('-------------- below is other_files_arr')
print(other_files)

Filtering CSV Files

import os
import csv

output_directory = 'extract'
target_columns = ['Column1', 'Column2', 'Column3', 'Column4', 'Column5', 'Column6']

Iterate with all files

Next, we define a function filter_csv_files that takes the directory path and target columns as inputs and returns two lists: files with an exact match of target columns and those without.

def filter_csv_files(directory_path, target_columns):
    exact_match_files = []
    other_files = []
    target_columns_set = set(target_columns)

    for filename in os.listdir(directory_path):
        if filename.endswith('.csv'):
            file_path = os.path.join(directory_path, filename)
            try:
                with open(file_path, mode='r', encoding='utf-8') as file:
                    reader = csv.reader(file)
                    columns = next(reader, None)
                    if columns:
                        columns_set = set(columns)
                        if columns_set == target_columns_set:
                            exact_match_files.append(file_path)
                        else:
                            other_files.append(file_path)
            except Exception as e:
                print(f"Error reading {filename}: {e}")

    return exact_match_files, other_files

Calling the function

Finally, we call the filter_csv_files function with the output directory and target columns, and print the resulting lists of files.

exact_match_files, other_files = filter_csv_files(output_directory, target_columns)

print("Files with Exact Match of Target Columns:")
print(exact_match_files)
print("\nOther Files:")
print(other_files)