@@ -888,8 +888,109 @@ static int apic_set_affinity(struct irq_data *irqd,
 	return err ? err : IRQ_SET_MASK_OK;
 }
 
+static void free_moved_vector(struct apic_chip_data *apicd)
+{
+	unsigned int vector = apicd->prev_vector;
+	unsigned int cpu = apicd->prev_cpu;
+	bool managed = apicd->is_managed;
+
+	/*
+	 * Managed interrupts are usually not migrated away
+	 * from an online CPU, but CPU isolation 'managed_irq'
+	 * can make that happen.
+	 * 1) Activation does not take the isolation into account
+	 *    to keep the code simple
+	 * 2) Migration away from an isolated CPU can happen when
+	 *    a non-isolated CPU which is in the calculated
+	 *    affinity mask comes online.
+	 */
+	trace_vector_free_moved(apicd->irq, cpu, vector, managed);
+	irq_matrix_free(vector_matrix, cpu, vector, managed);
+	per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
+	hlist_del_init(&apicd->clist);
+	apicd->prev_vector = 0;
+	apicd->move_in_progress = 0;
+}
+
+/*
+ * Called from fixup_irqs() with @desc->lock held and interrupts disabled.
+ */
+static void apic_force_complete_move(struct irq_data *irqd)
+{
+	unsigned int cpu = smp_processor_id();
+	struct apic_chip_data *apicd;
+	unsigned int vector;
+
+	guard(raw_spinlock)(&vector_lock);
+	apicd = apic_chip_data(irqd);
+	if (!apicd)
+		return;
+
+	/*
+	 * If prev_vector is empty or the descriptor is neither currently
+	 * nor previously on the outgoing CPU no action required.
+	 */
+	vector = apicd->prev_vector;
+	if (!vector || (apicd->cpu != cpu && apicd->prev_cpu != cpu))
+		return;
+
+	/*
+	 * This is tricky. If the cleanup of the old vector has not been
+	 * done yet, then the following setaffinity call will fail with
+	 * -EBUSY. This can leave the interrupt in a stale state.
+	 *
+	 * All CPUs are stuck in stop machine with interrupts disabled so
+	 * calling __irq_complete_move() would be completely pointless.
+	 *
+	 * 1) The interrupt is in move_in_progress state. That means that we
+	 *    have not seen an interrupt since the io_apic was reprogrammed to
+	 *    the new vector.
+	 *
+	 * 2) The interrupt has fired on the new vector, but the cleanup IPIs
+	 *    have not been processed yet.
+	 */
+	if (apicd->move_in_progress) {
+		/*
+		 * In theory there is a race:
+		 *
+		 * set_ioapic(new_vector) <-- Interrupt is raised before update
+		 *                            is effective, i.e. it's raised on
+		 *                            the old vector.
+		 *
+		 * So if the target cpu cannot handle that interrupt before
+		 * the old vector is cleaned up, we get a spurious interrupt
+		 * and in the worst case the ioapic irq line becomes stale.
+		 *
+		 * But in case of cpu hotplug this should be a non issue
+		 * because if the affinity update happens right before all
+		 * cpus rendezvous in stop machine, there is no way that the
+		 * interrupt can be blocked on the target cpu because all cpus
+		 * loops first with interrupts enabled in stop machine, so the
+		 * old vector is not yet cleaned up when the interrupt fires.
+		 *
+		 * So the only way to run into this issue is if the delivery
+		 * of the interrupt on the apic/system bus would be delayed
+		 * beyond the point where the target cpu disables interrupts
+		 * in stop machine. I doubt that it can happen, but at least
+		 * there is a theoretical chance. Virtualization might be
+		 * able to expose this, but AFAICT the IOAPIC emulation is not
+		 * as stupid as the real hardware.
+		 *
+		 * Anyway, there is nothing we can do about that at this point
+		 * w/o refactoring the whole fixup_irq() business completely.
+		 * We print at least the irq number and the old vector number,
+		 * so we have the necessary information when a problem in that
+		 * area arises.
+		 */
+		pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n",
+			irqd->irq, vector);
+	}
+	free_moved_vector(apicd);
+}
+
 #else
-# define apic_set_affinity	NULL
+# define apic_set_affinity		NULL
+# define apic_force_complete_move	NULL
 #endif
 
 static int apic_retrigger_irq(struct irq_data *irqd)
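
A side note on the locking style above: the new apic_force_complete_move() replaces the goto/unlock error paths of the old irq_force_complete_move() (removed in the last hunk below) with scope-based locking from linux/cleanup.h. guard(raw_spinlock)(&vector_lock) takes the lock at that point and drops it automatically on every return path. A minimal sketch of the pattern, with a hypothetical demo_lock/demo_count standing in for vector_lock and the APIC state:

#include <linux/cleanup.h>	/* scope-based cleanup, provides guard() */
#include <linux/spinlock.h>	/* raw_spinlock guard definition */

static DEFINE_RAW_SPINLOCK(demo_lock);	/* hypothetical stand-in for vector_lock */
static unsigned int demo_count;

static void demo_update(bool bail_early)
{
	/* Lock is acquired here and released automatically at scope exit. */
	guard(raw_spinlock)(&demo_lock);

	if (bail_early)
		return;	/* no "goto unlock" needed on early returns */

	demo_count++;
}	/* raw_spin_unlock(&demo_lock) runs implicitly here */

That is why the unlock: label and the explicit raw_spin_unlock() of the old function simply disappear in the new version.
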
@@ -923,39 +1024,16 @@ static void x86_vector_msi_compose_msg(struct irq_data *data,
 }
 
 static struct irq_chip lapic_controller = {
-	.name			= "APIC",
-	.irq_ack		= apic_ack_edge,
-	.irq_set_affinity	= apic_set_affinity,
-	.irq_compose_msi_msg	= x86_vector_msi_compose_msg,
-	.irq_retrigger		= apic_retrigger_irq,
+	.name				= "APIC",
+	.irq_ack			= apic_ack_edge,
+	.irq_set_affinity		= apic_set_affinity,
+	.irq_compose_msi_msg		= x86_vector_msi_compose_msg,
+	.irq_force_complete_move	= apic_force_complete_move,
+	.irq_retrigger			= apic_retrigger_irq,
 };
 
 #ifdef CONFIG_SMP
 
-static void free_moved_vector(struct apic_chip_data *apicd)
-{
-	unsigned int vector = apicd->prev_vector;
-	unsigned int cpu = apicd->prev_cpu;
-	bool managed = apicd->is_managed;
-
-	/*
-	 * Managed interrupts are usually not migrated away
-	 * from an online CPU, but CPU isolation 'managed_irq'
-	 * can make that happen.
-	 * 1) Activation does not take the isolation into account
-	 *    to keep the code simple
-	 * 2) Migration away from an isolated CPU can happen when
-	 *    a non-isolated CPU which is in the calculated
-	 *    affinity mask comes online.
-	 */
-	trace_vector_free_moved(apicd->irq, cpu, vector, managed);
-	irq_matrix_free(vector_matrix, cpu, vector, managed);
-	per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
-	hlist_del_init(&apicd->clist);
-	apicd->prev_vector = 0;
-	apicd->move_in_progress = 0;
-}
-
 static void __vector_cleanup(struct vector_cleanup *cl, bool check_irr)
 {
 	struct apic_chip_data *apicd;
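
Wiring apic_force_complete_move into lapic_controller is the heart of the change: the forced-move cleanup becomes an irq_chip callback instead of a hard-coded call into x86 code. A hedged sketch of how a generic caller can then reach it through the chip; the names and structure below are illustrative only, not the actual core helper from this series, which presumably also resolves the correct irq_data in the irqdomain hierarchy first:

static void force_complete_move_sketch(struct irq_desc *desc)
{
	struct irq_data *d = irq_desc_get_irq_data(desc);
	struct irq_chip *chip = irq_data_get_irq_chip(d);

	/* Only chips that opt in provide the callback; others are a no-op. */
	if (chip && chip->irq_force_complete_move)
		chip->irq_force_complete_move(d);
}

Dispatching through the chip also explains why the new callback can drop the old "check that chip_data is really apic_chip_data" dance: the core only invokes the callback on the chip that actually provides it.
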
@@ -1068,99 +1146,6 @@ void irq_complete_move(struct irq_cfg *cfg)
 	__vector_schedule_cleanup(apicd);
 }
 
-/*
- * Called from fixup_irqs() with @desc->lock held and interrupts disabled.
- */
-void irq_force_complete_move(struct irq_desc *desc)
-{
-	unsigned int cpu = smp_processor_id();
-	struct apic_chip_data *apicd;
-	struct irq_data *irqd;
-	unsigned int vector;
-
-	/*
-	 * The function is called for all descriptors regardless of which
-	 * irqdomain they belong to. For example if an IRQ is provided by
-	 * an irq_chip as part of a GPIO driver, the chip data for that
-	 * descriptor is specific to the irq_chip in question.
-	 *
-	 * Check first that the chip_data is what we expect
-	 * (apic_chip_data) before touching it any further.
-	 */
-	irqd = irq_domain_get_irq_data(x86_vector_domain,
-				       irq_desc_get_irq(desc));
-	if (!irqd)
-		return;
-
-	raw_spin_lock(&vector_lock);
-	apicd = apic_chip_data(irqd);
-	if (!apicd)
-		goto unlock;
-
-	/*
-	 * If prev_vector is empty or the descriptor is neither currently
-	 * nor previously on the outgoing CPU no action required.
-	 */
-	vector = apicd->prev_vector;
-	if (!vector || (apicd->cpu != cpu && apicd->prev_cpu != cpu))
-		goto unlock;
-
-	/*
-	 * This is tricky. If the cleanup of the old vector has not been
-	 * done yet, then the following setaffinity call will fail with
-	 * -EBUSY. This can leave the interrupt in a stale state.
-	 *
-	 * All CPUs are stuck in stop machine with interrupts disabled so
-	 * calling __irq_complete_move() would be completely pointless.
-	 *
-	 * 1) The interrupt is in move_in_progress state. That means that we
-	 *    have not seen an interrupt since the io_apic was reprogrammed to
-	 *    the new vector.
-	 *
-	 * 2) The interrupt has fired on the new vector, but the cleanup IPIs
-	 *    have not been processed yet.
-	 */
-	if (apicd->move_in_progress) {
-		/*
-		 * In theory there is a race:
-		 *
-		 * set_ioapic(new_vector) <-- Interrupt is raised before update
-		 *                            is effective, i.e. it's raised on
-		 *                            the old vector.
-		 *
-		 * So if the target cpu cannot handle that interrupt before
-		 * the old vector is cleaned up, we get a spurious interrupt
-		 * and in the worst case the ioapic irq line becomes stale.
-		 *
-		 * But in case of cpu hotplug this should be a non issue
-		 * because if the affinity update happens right before all
-		 * cpus rendezvous in stop machine, there is no way that the
-		 * interrupt can be blocked on the target cpu because all cpus
-		 * loops first with interrupts enabled in stop machine, so the
-		 * old vector is not yet cleaned up when the interrupt fires.
-		 *
-		 * So the only way to run into this issue is if the delivery
-		 * of the interrupt on the apic/system bus would be delayed
-		 * beyond the point where the target cpu disables interrupts
-		 * in stop machine. I doubt that it can happen, but at least
-		 * there is a theoretical chance. Virtualization might be
-		 * able to expose this, but AFAICT the IOAPIC emulation is not
-		 * as stupid as the real hardware.
-		 *
-		 * Anyway, there is nothing we can do about that at this point
-		 * w/o refactoring the whole fixup_irq() business completely.
-		 * We print at least the irq number and the old vector number,
-		 * so we have the necessary information when a problem in that
-		 * area arises.
-		 */
-		pr_warn("IRQ fixup: irq %d move in progress, old vector %d\n",
-			irqd->irq, vector);
-	}
-	free_moved_vector(apicd);
-unlock:
-	raw_spin_unlock(&vector_lock);
-}
-
 #ifdef CONFIG_HOTPLUG_CPU
 /*
  * Note, this is not accurate accounting, but at least good enough to