In this tutorial, we take a hands-on approach to building an advanced convolutional neural network for DNA sequence classification. We simulate data that mimics real biological tasks, such as promoter prediction, splice-site detection, and regulatory-element identification. By combining one-hot encoding, multi-scale convolutional layers, and an attention mechanism, we design a model that not only learns complex motifs but also offers interpretability. As we progress, we generate synthetic data, train with robust callbacks, and visualize results so that we fully understand the strengths and limitations of our approach. Check out the FULL CODES here.
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import random
np.random.seed(42)
tf.random.set_seed(42)
random.seed(42)
We begin by importing the libraries for deep learning, data handling, and visualization. We set random seeds to ensure reproducibility so that our experiments run consistently each time. Check out the FULL CODES here.
class DNASequenceClassifier:
    def __init__(self, sequence_length=200, num_classes=2):
        self.sequence_length = sequence_length
        self.num_classes = num_classes
        self.model = None
        self.history = None

    def one_hot_encode(self, sequences):
        mapping = {'A': 0, 'T': 1, 'G': 2, 'C': 3}
        encoded = np.zeros((len(sequences), self.sequence_length, 4))
        for i, seq in enumerate(sequences):
            for j, nucleotide in enumerate(seq[:self.sequence_length]):
                if nucleotide in mapping:
                    encoded[i, j, mapping[nucleotide]] = 1
        return encoded
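    # Example: one_hot_encode(["ATGC"]) fills the first four positions as
    # A -> [1,0,0,0], T -> [0,1,0,0], G -> [0,0,1,0], C -> [0,0,0,1];
    # positions beyond the string's end, and ambiguous bases such as 'N',
    # stay all-zero, which doubles as zero-padding for short sequences.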
    def attention_layer(self, inputs, name="attention"):
        attention_weights = layers.Dense(1, activation='tanh', name=f"{name}_weights")(inputs)
        attention_weights = layers.Flatten()(attention_weights)
        attention_weights = layers.Activation('softmax', name=f"{name}_softmax")(attention_weights)
        attention_weights = layers.RepeatVector(inputs.shape[-1])(attention_weights)
        attention_weights = layers.Permute([2, 1])(attention_weights)
        attended = layers.Multiply(name=f"{name}_multiply")([inputs, attention_weights])
        return layers.GlobalMaxPooling1D()(attended)
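    # Shape trace for the attention block, assuming a (batch, 200, 64) conv feature map:
    #   Dense(1, tanh)      -> (batch, 200, 1)   one relevance score per position
    #   Flatten + softmax   -> (batch, 200)      scores normalized across positions
    #   RepeatVector(64)    -> (batch, 64, 200)  weights copied once per channel
    #   Permute([2, 1])     -> (batch, 200, 64)  weights aligned with the feature map
    #   Multiply            -> (batch, 200, 64)  position-weighted features
    #   GlobalMaxPooling1D  -> (batch, 64)       strongest weighted activation per channel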
    def build_model(self):
        inputs = layers.Input(shape=(self.sequence_length, 4), name="dna_input")
        conv_layers = []
        filter_sizes = [3, 7, 15, 25]
        for filter_size in filter_sizes:
            conv = layers.Conv1D(
                filters=64,
                kernel_size=filter_size,
                activation='relu',
                padding='same',
                name=f"conv_{filter_size}"
            )(inputs)
            conv = layers.BatchNormalization(name=f"bn_conv_{filter_size}")(conv)
            conv = layers.Dropout(0.2, name=f"dropout_conv_{filter_size}")(conv)
            attended = self.attention_layer(conv, name=f"attention_{filter_size}")
            conv_layers.append(attended)
        if len(conv_layers) > 1:
            merged = layers.Concatenate(name="concat_multiscale")(conv_layers)
        else:
            merged = conv_layers[0]
        dense = layers.Dense(256, activation='relu', name="dense_1")(merged)
        dense = layers.BatchNormalization(name="bn_dense_1")(dense)
        dense = layers.Dropout(0.5, name="dropout_dense_1")(dense)
        dense = layers.Dense(128, activation='relu', name="dense_2")(dense)
        dense = layers.BatchNormalization(name="bn_dense_2")(dense)
        dense = layers.Dropout(0.3, name="dropout_dense_2")(dense)
        if self.num_classes == 2:
            outputs = layers.Dense(1, activation='sigmoid', name="output")(dense)
            loss = 'binary_crossentropy'
            # Explicit metric objects are more portable across Keras versions
            # than the string identifiers 'precision' and 'recall'.
            metrics = [
                'accuracy',
                keras.metrics.Precision(name='precision'),
                keras.metrics.Recall(name='recall')
            ]
        else:
            outputs = layers.Dense(self.num_classes, activation='softmax', name="output")(dense)
            loss = 'categorical_crossentropy'
            metrics = ['accuracy']
        self.model = keras.Model(inputs=inputs, outputs=outputs, name="DNA_CNN_Classifier")
        optimizer = keras.optimizers.Adam(
            learning_rate=0.001,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-7
        )
        self.model.compile(
            optimizer=optimizer,
            loss=loss,
            metrics=metrics
        )
        return self.model
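    # End-to-end tensor flow through build_model for the binary case:
    #   input (batch, 200, 4)
    #     -> four parallel Conv1D branches (kernel sizes 3/7/15/25), each (batch, 200, 64)
    #     -> per-branch attention pooling, each (batch, 64)
    #     -> concatenation (batch, 256) -> Dense 256 -> Dense 128 -> sigmoid (batch, 1)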
    def generate_synthetic_data(self, n_samples=10000):
        sequences = []
        labels = []
        positive_motifs = ['TATAAA', 'CAAT', 'GGGCGG', 'TTGACA']
        negative_motifs = ['AAAAAAA', 'TTTTTTT', 'CCCCCCC', 'GGGGGGG']
        nucleotides = ['A', 'T', 'G', 'C']
        for i in range(n_samples):
            sequence = ''.join(random.choices(nucleotides, k=self.sequence_length))
            if i < n_samples // 2:
                motif = random.choice(positive_motifs)
                pos = random.randint(0, self.sequence_length - len(motif))
                sequence = sequence[:pos] + motif + sequence[pos + len(motif):]
                label = 1
            else:
                if random.random() < 0.3:
                    motif = random.choice(negative_motifs)
                    pos = random.randint(0, self.sequence_length - len(motif))
                    sequence = sequence[:pos] + motif + sequence[pos + len(motif):]
                label = 0
            sequences.append(sequence)
            labels.append(label)
        return sequences, np.array(labels)
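    # With n_samples=10000, the first 5000 sequences each carry one planted positive
    # motif (e.g. the TATA-box consensus 'TATAAA') and are labeled 1; the remaining
    # 5000 are labeled 0, with ~30% of them salted with a low-complexity homopolymer
    # so that the negative class is not purely random background.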
    def train(self, X_train, y_train, X_val, y_val, epochs=50, batch_size=32):
        callbacks = [
            keras.callbacks.EarlyStopping(
                monitor='val_loss',
                patience=10,
                restore_best_weights=True
            ),
            keras.callbacks.ReduceLROnPlateau(
                monitor='val_loss',
                factor=0.5,
                patience=5,
                min_lr=1e-6
            )
        ]
        self.history = self.model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=epochs,
            batch_size=batch_size,
            callbacks=callbacks,
            verbose=1
        )
        return self.history
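    # Because ReduceLROnPlateau (patience=5) triggers before EarlyStopping (patience=10),
    # the learning rate is halved at least once during a validation-loss plateau before
    # training stops, and restore_best_weights=True rolls the model back to its best
    # validation-loss epoch.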
    def evaluate_and_visualize(self, X_test, y_test):
        y_pred_proba = self.model.predict(X_test).flatten()
        y_pred = (y_pred_proba > 0.5).astype(int)
        print("Classification Report:")
        print(classification_report(y_test, y_pred))
        fig, axes = plt.subplots(2, 2, figsize=(15, 10))
        axes[0, 0].plot(self.history.history['loss'], label='Training Loss')
        axes[0, 0].plot(self.history.history['val_loss'], label='Validation Loss')
        axes[0, 0].set_title('Training History - Loss')
        axes[0, 0].set_xlabel('Epoch')
        axes[0, 0].set_ylabel('Loss')
        axes[0, 0].legend()
        axes[0, 1].plot(self.history.history['accuracy'], label='Training Accuracy')
        axes[0, 1].plot(self.history.history['val_accuracy'], label='Validation Accuracy')
        axes[0, 1].set_title('Training History - Accuracy')
        axes[0, 1].set_xlabel('Epoch')
        axes[0, 1].set_ylabel('Accuracy')
        axes[0, 1].legend()
        cm = confusion_matrix(y_test, y_pred)
        sns.heatmap(cm, annot=True, fmt='d', ax=axes[1, 0], cmap='Blues')
        axes[1, 0].set_title('Confusion Matrix')
        axes[1, 0].set_ylabel('Actual')
        axes[1, 0].set_xlabel('Predicted')
        axes[1, 1].hist(y_pred_proba[y_test == 0], bins=50, alpha=0.7, label='Negative', density=True)
        axes[1, 1].hist(y_pred_proba[y_test == 1], bins=50, alpha=0.7, label='Positive', density=True)
        axes[1, 1].set_title('Prediction Score Distribution')
        axes[1, 1].set_xlabel('Prediction Score')
        axes[1, 1].set_ylabel('Density')
        axes[1, 1].legend()
        plt.tight_layout()
        plt.show()
        return y_pred, y_pred_proba
We define a DNASequenceClassifier that encodes sequences, learns multi-scale motifs with CNNs, and applies an attention mechanism for interpretability. We build and compile the model, generate synthetic motif-rich data, and then train with robust callbacks and visualize performance to evaluate classification quality. Check out the FULL CODES here.
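Before running the full pipeline, it helps to smoke-test the individual pieces. The minimal sketch below is our own illustration (it assumes the imports and the DNASequenceClassifier class above have already been executed): it encodes a toy sequence, confirms the attention block pools to one vector per channel, and checks that the synthetic labels are balanced.
clf = DNASequenceClassifier(sequence_length=200, num_classes=2)

# One-hot encoding: a 4-base string fills 4 positions; the rest stay zero-padded.
demo = clf.one_hot_encode(["ATGC"])
print(demo.shape)            # (1, 200, 4)
print(demo[0, :4].tolist())  # rows for A, T, G, C

# Attention block: feed a stand-in conv feature map; output should be (None, 64).
dummy = layers.Input(shape=(200, 64))
print(clf.attention_layer(dummy, name="demo_attention").shape)

# Synthetic data: labels should split 50/50 between the two classes.
seqs, labs = clf.generate_synthetic_data(n_samples=1000)
print(np.bincount(labs))     # [500 500]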
def main():
    print("\nAdvanced DNA Sequence Classification with CNN")
    print("=" * 50)
    classifier = DNASequenceClassifier(sequence_length=200, num_classes=2)
    print("Generating synthetic DNA sequences...")
    sequences, labels = classifier.generate_synthetic_data(n_samples=10000)
    print("Encoding DNA sequences...")
    X = classifier.one_hot_encode(sequences)
    X_train, X_test, y_train, y_test = train_test_split(
        X, labels, test_size=0.2, random_state=42, stratify=labels
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
    )
    print(f"Training set: {X_train.shape}")
    print(f"Validation set: {X_val.shape}")
    print(f"Test set: {X_test.shape}")
    print("Building CNN model...")
    model = classifier.build_model()
    model.summary()
    print("Training model...")
    classifier.train(X_train, y_train, X_val, y_val, epochs=30, batch_size=64)
    print("Evaluating model...")
    y_pred, y_pred_proba = classifier.evaluate_and_visualize(X_test, y_test)
    print("\nTraining and evaluation complete!")

if __name__ == "__main__":
    main()
We wrap up the workflow in the main() function, where we generate synthetic DNA data, encode it, split it into training, validation, and test sets, then build, train, and evaluate our CNN model. We conclude by visualizing the performance and confirming that the classification pipeline runs successfully from start to finish.
In conclusion, we successfully demonstrate how a carefully designed CNN with attention can classify DNA sequences with high accuracy and interpretability. We see how synthetic biological motifs help validate the model’s capacity for pattern recognition, and how visualization techniques provide meaningful insights into training dynamics and predictions. Through this journey, we enhance our ability to integrate deep learning architectures with biological data, laying the groundwork for applying these methods to real-world genomics research.
Check out the FULL CODES here.